]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
[cleanup Misc
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
6929b41a 1import asyncio
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
ab029d7e 8import collections.abc
62e609ab 9import contextlib
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
d77c3dfd 14import gzip
49fa4d9a
N
15import hashlib
16import hmac
ac668111 17import html.entities
18import html.parser
54007a45 19import http.client
20import http.cookiejar
019a94f7 21import importlib.util
b1f94422 22import inspect
03f9daab 23import io
79a2e94e 24import itertools
f4bfd65f 25import json
d77c3dfd 26import locale
02dbf93f 27import math
f8271158 28import mimetypes
347de493 29import operator
d77c3dfd 30import os
c496ca96 31import platform
773f291d 32import random
d77c3dfd 33import re
f8271158 34import shlex
c496ca96 35import socket
79a2e94e 36import ssl
ac668111 37import struct
1c088fa8 38import subprocess
d77c3dfd 39import sys
181c8655 40import tempfile
c380cc28 41import time
01951dda 42import traceback
64fa820c 43import types
989a01c2 44import unicodedata
14f25df2 45import urllib.error
f8271158 46import urllib.parse
ac668111 47import urllib.request
bcf89ce6 48import xml.etree.ElementTree
d77c3dfd 49import zlib
d77c3dfd 50
6929b41a 51from .compat import functools # isort: split
8c25f81b 52from .compat import (
36e6f62c 53 compat_etree_fromstring,
51098426 54 compat_expanduser,
f8271158 55 compat_HTMLParseError,
efa97bdc 56 compat_os_name,
702ccf2d 57 compat_shlex_quote,
8c25f81b 58)
ac668111 59from .dependencies import brotli, certifi, websockets, xattr
f8271158 60from .socks import ProxyType, sockssocket
71aff188 61
4644ac55 62
51fb4995
YCH
def register_socks_protocols():
    """Register the SOCKS proxy schemes with urllib.parse.

    URLs whose scheme is missing from ``urllib.parse.uses_netloc`` are not
    split correctly (see https://bugs.python.org/issue7904), so every SOCKS
    scheme that is not listed there yet gets added.
    """
    registered = urllib.parse.uses_netloc
    registered.extend([
        scheme for scheme in ('socks', 'socks4', 'socks4a', 'socks5')
        if scheme not in registered])
51fb4995
YCH
70
71
468e2e92
FV
# The type of compiled patterns is not clearly defined otherwise,
# so it is taken from an actual compiled pattern
compiled_regex_type = type(re.compile(''))
74
f7a147e3
S
75
def random_user_agent():
    """Return a Chrome-on-Windows User-Agent string with a randomly chosen Chrome version."""
    chrome_versions = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    template = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/%s Safari/537.36')
    picked_version = random.choice(chrome_versions)
    return template % picked_version
119
120
# 'br' is only listed when the `brotli` name imported from .dependencies is truthy
SUPPORTED_ENCODINGS = ['gzip', 'deflate'] + (['br'] if brotli else [])
126
# Default HTTP headers. The User-Agent is picked once, when this module is imported
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
f427df17 133
5f6a1245 134
fb37eb25
S
# Fixed User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': (
        'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 '
        '(KHTML, like Gecko) Version/5.0.4 Safari/533.20.27'),
}
138
139
# Unique sentinel for "no default given"; compared by identity so that None stays usable as a default
NO_DEFAULT = object()
# Function that returns its argument unchanged
IDENTITY = lambda x: x
bf42a990 142
7105440c
YCH
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
]

# Month names per language code, in calendar order
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril',
        'mai', 'juin', 'juillet', 'août',
        'septembre', 'octobre', 'novembre', 'décembre',
    ],
}
a942d6cb 153
# Timezone abbreviation -> offset from UTC in hours
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    # Atlantic (used in Canada)
    'AST': -4, 'ADT': -3,
    # Eastern
    'EST': -5, 'EDT': -4,
    # Central
    'CST': -6, 'CDT': -5,
    # Mountain
    'MST': -7, 'MDT': -6,
    # Pacific
    'PST': -8, 'PDT': -7,
}
163
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement; the replacements that
# are longer than one character are passed as one-element lists
_ACCENTED = 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ'
_ACCENT_REPLACEMENTS = itertools.chain(
    'AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
    'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')
ACCENT_CHARS = dict(zip(_ACCENTED, _ACCENT_REPLACEMENTS))
del _ACCENTED, _ACCENT_REPLACEMENTS
c587cbb7 168
46f59e89
S
# strptime formats that do not depend on whether the day or the month is written first
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# The common formats followed by the numeric formats that put the day first
DATE_FORMATS_DAY_FIRST = [
    *DATE_FORMATS,
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
]

# The common formats followed by the numeric formats that put the month first
DATE_FORMATS_MONTH_FIRST = [
    *DATE_FORMATS,
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
]
233
# Captures the four arguments in text of the form }('...',N,N,'...'.split('|')
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Captures the object/array inside a <script type="application/ld+json"> tag as `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# An unsigned integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
238
7105440c 239
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns locale.getpreferredencoding() when text can actually be encoded
    with it, and 'UTF-8' if determining or using that encoding fails.
    The result is cached after the first call.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Make sure the reported encoding is usable
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
d77c3dfd 254
f4bfd65f 255
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # The data goes to a temporary file next to fn, which is then renamed onto fn
    target_dir, target_name = os.path.split(fn)
    tmp = tempfile.NamedTemporaryFile(
        mode='w', encoding='utf-8', delete=False,
        dir=target_dir, prefix=f'{target_name}.', suffix='.tmp')

    try:
        with tmp:
            json.dump(obj, tmp, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            # os.umask can only be read by setting it, so set it and restore it right away
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tmp.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tmp.name, fn)
    except Exception:
        # Do not leave the temporary file behind on failure
        try:
            os.remove(tmp.name)
        except OSError:
            pass
        raise
280
281
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    # Only plain attribute names are accepted, since key is inserted into the expression
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        # No value given: any element that has the attribute matches
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
59ae56fa 287
d7e66d39
JMF
288# On python2.6 the xml.etree.ElementTree.Element methods don't support
289# the namespace parameter
5f6a1245
JW
290
291
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Replace every `prefix:tag` step of path by `{namespace}tag`, looking up prefix in ns_map"""
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            # No namespace prefix in this step
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(map(expand, path.split('/')))
302
d77c3dfd 303
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element of node that matches xpath.

    @param xpath    A single xpath string, or an iterable of xpaths which are
                    tried in order until one of them matches
    @param name     Name used in the error message; defaults to xpath
    @param fatal    Raise ExtractorError if no element is found
    @param default  Value returned if no element is found.
                    Takes precedence over fatal
    """
    candidates = [xpath] if isinstance(xpath, str) else xpath

    # Start out as "not found" so that an empty iterable of xpaths is handled
    # like a failed lookup instead of leaving the result unbound
    found = None
    for xp in candidates:
        found = node.find(xp)
        if found is not None:
            break

    if found is not None:
        return found
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
    return None
325
326
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element.

    If the element is missing, whatever xpath_element returned (None or
    default) is passed through. If the element has no text, default is
    returned when given, else ExtractorError is raised when fatal, else None.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element

    if element.text is not None:
        return element.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
a41fb80c
S
340
341
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    If there is no such element, default is returned when given, else
    ExtractorError is raised when fatal, else None is returned.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]

    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = f'{xpath}[@{key}]' if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
bf0ff932
PH
353
354
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is an attribute lookup on "id"; extra keyword arguments are passed along
    content = get_element_by_attribute('id', id, html, **kwargs)
    return content
43e8fafd 358
12ea2f30 359
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is an attribute lookup on "id"; extra keyword arguments are passed along
    whole = get_element_html_by_attribute('id', id, html, **kwargs)
    return whole
6f32a0b5
ZM
363
364
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
369
370
6f32a0b5
ZM
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_html_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
375
376
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute, or None if there is none"""
    matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    if not matches:
        return None
    return matches[0]
380
381
def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the html of the first tag with the specified attribute, or None if there is none"""
    matches = get_elements_html_by_attribute(attribute, value, html, **kargs)
    if not matches:
        return None
    return matches[0]
385
386
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list

    Extra keyword arguments are forwarded to get_elements_by_attribute.
    `escape_value` is always False here, since the value passed on is a regex.
    """
    # Matches a class attribute value in which class_name is one of the classes
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    # Forward the accepted keyword arguments instead of silently dropping them;
    # escape_value is set last so that a caller-supplied value cannot override it
    return get_elements_by_attribute(
        'class', class_value_re, html, **{**kargs, 'escape_value': False})
392
393
6f32a0b5
ZM
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # Matches a class attribute value in which class_name is one of the classes.
    # It is already a regex, hence escape_value=False
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
399
400
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list"""
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    # Each pair is (content, whole); keep the content
    return [pair[0] for pair in pairs]
404
405
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list"""
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    # Each pair is (content, whole); keep the whole element
    return [pair[1] for pair in pairs]
409
410
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Yield a (content, whole) pair for every tag in the passed HTML document
    whose given attribute has the given value; content is the unescaped
    text inside the tag and whole is the html of the entire element

    @param tag           Regex that the tag name has to match
    @param escape_value  Whether value is a literal string (and gets escaped)
                         rather than a regex
    """

    # The quotes around the attribute value are optional in the pattern,
    # unless value starts with a character from the set below
    quote_quantifier = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value_re = re.escape(value) if escape_value else value

    opening_tag_re = rf'''(?x)
        <(?P<tag>{tag})
        (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
        \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote_quantifier})(?-x:{value_re})(?P=_q)
    '''

    for mobj in re.finditer(opening_tag_re, html):
        # Get the complete element that starts at this opening tag
        content, whole = get_element_text_and_html_by_tag(mobj.group('tag'), html[mobj.start():])

        # Drop one pair of quotes that encloses the whole content, if any
        unquoted = re.sub(
            r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)
        yield unescapeHTML(unquoted), whole
a921f407 434
c5229f39 435
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser that keeps a stack of the open tags and raises
    HTMLBreakOnClosingTagException once the closing tag for the first
    opening tag it has encountered is reached. Usable as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        # Currently open tags, outermost first
        self.tagstack = collections.deque()
        super().__init__()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')

        # Pop tags until the one being closed is found; the tags opened
        # after it that were never closed are dropped along the way
        matched = False
        while self.tagstack and not matched:
            matched = self.tagstack.pop() == tag
        if not matched:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')

        # An empty stack means the first opening tag has just been closed
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
476
477
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    Raises compat_HTMLParseError if the element cannot be located or is not closed
    """
    def index_or_raise(text, needle, error):
        try:
            return text.index(needle)
        except ValueError:
            raise error

    closing_tag = f'</{tag}>'
    whole_start = index_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    # The content starts right after the '>' that ends the opening tag
    content_start = whole_start + 1 + index_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))

    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')

        # Feed the parser up to each following closing tag in turn, until it
        # signals that the closing tag matching our opening tag was reached
        offset = content_start
        while offset < len(html):
            closing_start = index_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            closing_end = closing_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + closing_end])
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                content = html[content_start:offset + closing_start]
                whole = html[whole_start:offset + closing_end]
                return content, whole
            offset += closing_end
        raise compat_HTMLParseError('unexpected end of html')
511
512
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the most recently seen opening tag, as a dict
        self.attrs = {}
        super().__init__()

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs
        self.attrs = {attr_name: attr_value for attr_name, attr_value in attrs}
522
c5229f39 523
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        # One attribute dict per <li> that was opened at nesting level 0
        self.items = []
        # Opening tags seen minus closing tags seen
        self._level = 0

    def handle_starttag(self, tag, attrs):
        at_top_level = self._level == 0
        if at_top_level and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
539
540
8bb56eee
BF
def extract_attributes(html_element):
    """Decode the attributes of an HTML element into a dictionary.

    Given a string for an HTML element such as
    <el
        a="foo" B="bar" c="&#98;az" d=boz
        empty= noval entity="&amp;"
        sq='"' dq="'"
    >
    the returned dictionary is
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    except compat_HTMLParseError:
        # Keep whatever attributes were gathered before the parse error
        pass
    return parser.attrs
9e6dd238 560
c5229f39 561
73673ccf
FF
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list with one dictionary of attributes per element"""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
569
570
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Applied in this order
    substitutions = (
        # Collapse every run of whitespace into a single space
        (r'\s+', ' '),
        # <br> and the boundary between two paragraphs become newlines
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)

    # Replace html entities
    return unescapeHTML(html).strip()
9e6dd238
FV
585
586
class LenientJSONDecoder(json.JSONDecoder):
    """JSON decoder with optional preprocessing and more helpful error messages.

    @param transform_source  Callable applied to the input string before decoding
    @param ignore_extra      Decode only the first JSON document in the string
                             and ignore anything that follows it
    All other arguments are passed to json.JSONDecoder.

    On failure, the raised json.JSONDecodeError additionally shows the part
    of the input around the position of the error.
    """

    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            # raw_decode does not skip leading whitespace. Strip it from `s` itself
            # so that error positions refer to the same string that is quoted below
            s = s.lstrip()
        try:
            if self.ignore_extra:
                return self.raw_decode(s)[0]
            return super().decode(s)
        except json.JSONDecodeError as e:
            if e.pos is not None:
                # Clamp the start: a negative index would count from the end
                # of the string and yield an empty or unrelated snippet
                context = s[max(e.pos - 10, 0):e.pos + 10]
                raise type(e)(f'{e.msg} in {context!r}', s, e.pos)
            raise
b7c47b74 603
604
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    The filename '-' stands for stdout. Any other filename is opened as it
    is; if that fails, one more attempt is made with the filename passed
    through sanitize_path. If this also fails, or sanitizing does not change
    the filename, the exception is raised like the standard open() function
    would.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        # Use the underlying binary stream when stdout has one
        return getattr(sys.stdout, 'buffer', sys.stdout), filename

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Could not lock the file: open it without a lock
                # NOTE(review): presumably LockingUnsupportedError is an OSError subclass - confirm
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            is_last_attempt = attempt > 0
            if is_last_attempt or err.errno == errno.EACCES:
                raise
            sanitized = sanitize_path(filename)
            if sanitized == filename:
                # Sanitizing changes nothing, so trying again is pointless
                raise
            filename = sanitized
d77c3dfd
FV
642
643
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None if the string cannot be parsed
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 651
5f6a1245 652
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    new_rules = is_id is NO_DEFAULT

    def replace_insane(char):
        # Substitutes are prefixed with '\0' so that they can be told apart
        # from characters of the input during the cleanup below
        code = ord(char)
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if not restricted and char == '\n':
            return '\0 '
        if new_rules and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(code + 0xfee0))
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        if char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '\0_'
        return char

    if restricted and new_rules:
        s = unicodedata.normalize('NFKC', s)
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if new_rules:
        # Remove repeated substitute chars
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)
        # Remove substitute chars from start/end
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)
    # Only the markers are dropped; the substitute characters themselves stay
    result = result.replace('\0', '') or '_'

    if not is_id:
        # Collapse runs of underscores and trim them from both ends
        result = re.sub(r'_{2,}', '_', result).strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[1:]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
d77c3dfd 705
5f6a1245 706
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    On other platforms the path is returned unchanged, unless force is set
    """
    on_windows = sys.platform == 'win32'
    if not on_windows and not force:
        return s

    if on_windows:
        # force has no effect on Windows
        force = False
        drive_or_unc = os.path.splitdrive(s)[0]
    else:
        drive_or_unc = ''

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)

    def sanitize_part(path_part):
        if path_part in ['.', '..']:
            return path_part
        # '#' replaces the characters listed in the pattern, as well as
        # whitespace or a dot at the end of the part
        return re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)

    sanitized_path = [sanitize_part(path_part) for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s.startswith(os.path.sep):
        # Keep absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
728
729
def sanitize_url(url, *, scheme='http'):
    """Add the scheme to protocol-less URLs and fix known typos in the scheme

    Returns None if url is None
    """
    if url is None:
        return
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return f'{scheme}:{url}'

    # Fix some common typos seen so far
    common_typos = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in common_typos:
        fixed, replacements = re.subn(mistake, fixup, url)
        if replacements:
            return fixed
    return url
17bcc626
S
748
749
def extract_basic_auth(url):
    """Split the credentials off a URL.

    Returns the tuple (url_without_credentials, authorization_header_value).
    If the URL contains no username, it is returned unchanged and the
    second item is None.
    """
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None

    host = parts.hostname
    # `hostname` has the brackets of an IPv6 literal stripped. They must be put
    # back, or the host can no longer be told apart from the port in the netloc
    if host is not None and ':' in host:
        host = f'[{host}]'
    netloc = host if parts.port is None else '%s:%d' % (host, parts.port)
    url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))

    credentials = '%s:%s' % (parts.username, parts.password or '')
    auth_payload = base64.b64encode(credentials.encode())
    return url, f'Basic {auth_payload.decode()}'
5435dcf9
HH
760
761
def sanitized_Request(url, *args, **kwargs):
    """Create a urllib.request.Request for the sanitized and escaped url

    Credentials contained in the URL are removed from it and sent as an
    Authorization header instead. Further arguments are passed to
    urllib.request.Request
    """
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # The headers are the second positional argument if given that way;
        # otherwise the keyword argument is used, and created if missing
        if len(args) >= 2:
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
67dda517
S
768
769
def expand_path(s):
    """Expand shell variables and ~"""
    # ~ is expanded first, then the variables
    user_expanded = compat_expanduser(s)
    return os.path.expandvars(user_expanded)
773
774
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable

    The first occurrence of each item is kept and the order is preserved.
    Returns a list, or a generator if lazy is set
    """
    def unique_items():
        # Do not use set since the items can be unhashable
        emitted = []
        for item in iterable:
            if item in emitted:
                continue
            emitted.append(item)
            yield item

    items = unique_items()
    return items if lazy else list(items)
d77c3dfd 785
912b38b4 786
55b2f099 787def _htmlentity_transform(entity_with_semicolon):
4e408e47 788 """Transforms an HTML entity to a character."""
55b2f099
YCH
789 entity = entity_with_semicolon[:-1]
790
4e408e47 791 # Known non-numeric HTML entity
ac668111 792 if entity in html.entities.name2codepoint:
793 return chr(html.entities.name2codepoint[entity])
4e408e47 794
62b58c09
L
795 # TODO: HTML5 allows entities without a semicolon.
796 # E.g. '&Eacuteric' should be decoded as 'Éric'.
ac668111 797 if entity_with_semicolon in html.entities.html5:
798 return html.entities.html5[entity_with_semicolon]
55b2f099 799
91757b0f 800 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
801 if mobj is not None:
802 numstr = mobj.group(1)
28e614de 803 if numstr.startswith('x'):
4e408e47 804 base = 16
28e614de 805 numstr = '0%s' % numstr
4e408e47
PH
806 else:
807 base = 10
067aa17e 808 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 809 with contextlib.suppress(ValueError):
ac668111 810 return chr(int(numstr, base))
4e408e47
PH
811
812 # Unknown entity in name, return its literal representation
7a3f0c00 813 return '&%s;' % entity
4e408e47
PH
814
815
def unescapeHTML(s):
    """Replace the HTML entities in s by the characters they stand for

    None is passed through unchanged
    """
    if s is None:
        return None
    assert isinstance(s, str)

    def decode_entity(mobj):
        # Group 1 is the entity without '&', but including ';'
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^&;]+;)', decode_entity, s)
d77c3dfd 823
8bf48f23 824
def escapeHTML(text):
    """Replace the characters & < > " ' in text by HTML entities"""
    replacements = (
        # '&' has to be first, so that the entities inserted afterwards are not escaped again
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    for char, entity in replacements:
        text = text.replace(char, entity)
    return text
834
835
def process_communicate_or_kill(p, *args, **kwargs):
    """Deprecated alias of Popen.communicate_or_kill; issues a deprecation warning"""
    message = (
        f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    deprecation_warning(message)
    return Popen.communicate_or_kill(p, *args, **kwargs)
f5b1bca9 840
841
class Popen(subprocess.Popen):
    """subprocess.Popen with a few conveniences

    - STARTF_USESHOWWINDOW is set in the startup info on Windows
    - the library path variables altered by PyInstaller are restored
    - text=True selects UTF-8 with errors='replace' unless specified otherwise
    - communicate_or_kill and run kill the process if communicating is interrupted
    """
    # Startup info that is passed to every process
    _startupinfo = None
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        for key in (
            'LD_LIBRARY_PATH',  # Linux
            'DYLD_LIBRARY_PATH',  # macOS
        ):
            # The original value is looked up under <key>_ORIG.
            # If there is none, the variable is removed altogether
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process and wait for it if anything is raised"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process; unless timeout is 0, also wait for it with that timeout"""
        super().kill()
        if timeout == 0:
            return
        self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run a process to completion and return the tuple (stdout, stderr, returncode)

        Output that was not captured or is empty is returned as an empty str
        or bytes, depending on whether the process is in text mode
        """
        with cls(*args, **kwargs) as proc:
            empty = '' if proc.text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or empty, stderr or empty, proc.returncode
f0c9fb96 897
d3c93ec2 898
aa49acd1
S
def get_subprocess_encoding():
    """Return the name of the encoding to use for subprocess calls"""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
909
910
def encodeFilename(s, for_subprocess=False):
    """Return the filename unchanged; for_subprocess is unused"""
    assert isinstance(s, str)
    filename = s
    return filename
aa49acd1
S
914
915
def decodeFilename(b, for_subprocess=False):
    """Return the filename unchanged; for_subprocess is unused"""
    filename = b
    return filename
8bf48f23 918
f07b74fc
PH
919
def encodeArgument(s):
    """Return `s` as a str, decoding it as ASCII when it is not one already."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
f07b74fc
PH
925
926
def decodeArgument(b):
    """Return the argument `b` unchanged."""
    return b
aa49acd1
S
929
930
8271226a
PH
def decodeOption(optval):
    """Return an option value as str (None is passed through).

    bytes values are decoded with the encoding given by preferredencoding().
    """
    if optval is None:
        return None

    value = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(value, str)
    return value
1c256f70 939
5f6a1245 940
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration in milliseconds into (hours, minutes, seconds, milliseconds)."""
    whole_seconds, milliseconds = divmod(msec, 1000)
    whole_minutes, seconds = divmod(whole_seconds, 60)
    hours, minutes = divmod(whole_minutes, 60)
    return _timetuple(hours=hours, minutes=minutes, seconds=seconds, milliseconds=milliseconds)
949
950
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H:]MM:SS, M:SS or S.

    @param delim  Separator placed between the fields
    @param msec   Append the milliseconds as a ".mmm" suffix
    """
    parts = timetuple_from_msec(secs * 1000)
    if parts.hours:
        text = '%d%s%02d%s%02d' % (parts.hours, delim, parts.minutes, delim, parts.seconds)
    elif parts.minutes:
        text = '%d%s%02d' % (parts.minutes, delim, parts.seconds)
    else:
        text = '%d' % parts.seconds
    if not msec:
        return text
    return '%s.%03d' % (text, parts.milliseconds)
4539dd30 960
a0ddb8a2 961
def _ssl_load_windows_store_certs(ssl_context, storename):
    """Load the server-auth certificates of a Windows cert store into ssl_context.

    Nothing is loaded if enumerating the store raises PermissionError;
    certificates that fail to load with ssl.SSLError are skipped.
    """
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        usable = []
        for cert, encoding, trust in ssl.enum_certificates(storename):
            if encoding != 'x509_asn':
                continue
            if trust is True or ssl.Purpose.SERVER_AUTH.oid in trust:
                usable.append(cert)
    except PermissionError:
        return
    for cert in usable:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
a2366922 973
77562778 974
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose SSL context follows `params`.

    Keys read from `params`: 'nocheckcertificate', 'legacyserverconnect',
    'compat_opts', 'client_certificate', 'client_certificate_key' and
    'client_certificate_password'. Extra keyword arguments are forwarded to
    YoutubeDLHTTPSHandler.

    Raises YoutubeDLError if the client certificate cannot be loaded.
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # check_hostname is assigned before verify_mode; this order also works when verification is turned off
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer the certifi bundle unless it is unavailable or disabled through compat_opts
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 1017
732ea2f0 1018
def bug_reports_message(before=';'):
    """Return the "please report this issue" notice.

    @param before  Text that precedes the notice. It is right-stripped and
                   joined to the notice with a single space. The notice is
                   capitalised when `before` is empty or ends a sentence.
    """
    from .update import REPOSITORY

    notice = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
              'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    prefix = before.rstrip()
    starts_new_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_new_sentence:
        notice = notice[0].title() + notice[1:]

    if not prefix:
        return notice
    return prefix + ' ' + notice
08f2a92c
JMF
1030
1031
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Subclasses may set a default message here
    msg = None

    def __init__(self, msg=None):
        """Use `msg`, else the class-level default, else the class name."""
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
bf5b9d85
PM
1042
1043
# Exception types that mark an error as network-related
network_exceptions = (urllib.error.URLError, http.client.HTTPException, socket.error)
if hasattr(ssl, 'CertificateError'):
    network_exceptions += (ssl.CertificateError,)
1048
1049
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.

        The error is also marked as expected when it is created while one of
        `network_exceptions` is being handled. The full message is built from
        the optional `ie` and `video_id` prefixes, the message itself, the
        optional `cause` and, for unexpected errors, the bug report notice.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        # Coerce once so that non-str messages (e.g. exception objects) are accepted
        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            # Raised while handling another ExtractorError: keep that error's exc_info
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            # str.join() only accepts str, so use the coerced message rather than the raw `msg`
            self.orig_msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        """Return the formatted `tb` and `cause` tracebacks, or None if there are none."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
01951dda 1082
1c256f70 1083
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extraction error for an unsupported URL; the URL is kept in `url`."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1089
1090
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
1094
1095
773f291d
S
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    It is always marked as expected; `countries` is stored as given.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs.update(expected=True)
        super().__init__(msg, **kwargs)
        self.countries = countries
1107
1108
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        # Always an expected error; fall back to a generic message
        kwargs.update(expected=True)
        message = msg or 'The channel is not currently live'
        super().__init__(message, **kwargs)
1115
1116
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
d77c3dfd
FV
1129
1130
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Default message used by YoutubeDLError when none is passed
    msg = 'Entry not found in info'
498f5606 1138
1139
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """Append `filename`, when given, to the default message."""
        if filename is not None:
            # Rebinds the message on the instance; the class default stays untouched
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
d77c3dfd
FV
1152
1153
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
5f6a1245 1160
5f6a1245 1161
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Default message used by YoutubeDLError when none is passed
    msg = 'The download was cancelled'
8b0d7497 1165
8b0d7497 1166
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Default message used by YoutubeDLError when none is passed
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1170
48f79687 1171
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    # Default message used by YoutubeDLError when none is passed
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1175
1176
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Default message used by YoutubeDLError when none is passed
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1180
1181
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """Store the `expected` flag alongside the message."""
        self.expected = expected
        super().__init__(msg)
1188
1189
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always uses the class message and is never marked as expected
        super().__init__(self.msg, expected=False)
f2ebc5c7 1196
d77c3dfd 1197
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """Append `err`, when given, to the default message."""
        if err is not None:
            # Rebinds the message on the instance; the class default stays untouched
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
d77c3dfd
FV
1210
1211
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both sizes are in bytes
        self.downloaded, self.expected = downloaded, expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
d77c3dfd 1225
5f6a1245 1226
class XAttrMetadataError(YoutubeDLError):
    """Error while writing xattr metadata.

    `reason` is derived from the error code and message and is one of
    'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg; the code is checked before the message text
        out_of_space = (
            code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in msg or 'Disk quota exceeded' in msg)
        if out_of_space:
            reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1241
1242
class XAttrUnavailableError(YoutubeDLError):
    """YoutubeDLError subclass with no behaviour of its own (xattr unavailable)."""
1245
1246
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an `http_class` connection that honours the 'source_address' param.

    @param ydl_handler  Handler whose `_params` dict is consulted
    @param http_class   Connection class, instantiated with *args/**kwargs
    @param is_https     Not used in this function

    When 'source_address' is set, the connection binds to it and only tries
    remote addresses of the same IP family.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        # NOTE: the `source_address` parameter below shadows the outer variable;
        # it is indexed as a sequence whose first item is the IP address
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # A dot in the source IP means IPv4, anything else is treated as IPv6
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate in turn; the last error is re-raised if all of them fail
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        # Only override the hook on connection objects that expose it
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1292
1293
def handle_youtubedl_headers(headers):
    """Apply the internal "Youtubedl-no-compression" marker header.

    Without the marker, `headers` is returned as is. With it, a new dict is
    returned that contains neither the marker nor any Accept-Encoding header
    (matched case-insensitively); the input mapping is left unmodified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    return {
        name: value for name, value in headers.items()
        if name != marker and name.lower() != 'accept-encoding'
    }
87f0e62d
YCH
1302
1303
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped,
    deflated and brotli-compressed responses from web servers. If
    compression is to be avoided in a particular request, the original
    request in the program code only has to include the HTTP header
    "Youtubedl-no-compression", which will be removed before making the
    real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Keep `params` in `_params`; other arguments go to HTTPHandler."""
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open `req`, using a SOCKS connection class if the request asks for one."""
        conn_class = http.client.HTTPConnection

        # "Ytdl-socks-proxy" is an internal header and is removed before the request is sent
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and then a zlib-wrapped one."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli data; empty input is returned unchanged."""
        if not data:
            return data
        # `brotli` here resolves to the module-level import, not to this method
        return brotli.decompress(data)

    def http_request(self, req):
        """Escape the URL and add the default headers before the request is sent."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Decode compressed response bodies and percent-encode redirect targets."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed until decompression succeeds
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic goes through the same request/response processing
    https_request = http_request
    https_response = http_response
bf50b038 1432
5de90176 1433
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of `base_class` that connects through a SOCKS proxy.

    @param base_class   http.client.HTTPConnection, HTTPSConnection or a subclass
    @param socks_proxy  Proxy URL with scheme socks5, socks4a, socks4 or socks
                        (treated as socks4). The port defaults to 1080 and the
                        credentials, if any, are URL-unquoted.

    Raises ValueError if the URL uses any other scheme.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear message instead of leaving `socks_type` unbound
        raise ValueError(f'Unsupported SOCKS proxy scheme: {url_components.scheme!r}')

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, wrap the proxied socket in TLS
            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1475
1476
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler that honours `params` and per-request SOCKS proxies."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Store `params` and the connection class (default: HTTPSConnection)."""
        super().__init__(*args, **kwargs)
        self._params = params
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection

    def https_open(self, req):
        """Open `req`; a TLS handshake-failure alert becomes a YoutubeDLError."""
        # Forward whichever of these the parent handler has set on this instance
        open_kwargs = {
            option: getattr(self, f'_{option}')
            for option in ('context', 'check_hostname')
            if hasattr(self, f'_{option}')
        }

        conn_class = self._https_conn_class
        # "Ytdl-socks-proxy" is an internal header and is removed before the request is sent
        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            conn_class = make_socks_conn_class(conn_class, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(_create_http_connection, self, conn_class, True)
        try:
            return self.do_open(connection_factory, req, **open_kwargs)
        except urllib.error.URLError as error:
            cause = error.reason
            handshake_failed = (
                isinstance(cause, ssl.SSLError)
                and getattr(cause, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE')
            if not handshake_failed:
                raise
            raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
be4a824d
PH
1505
1506
def is_path_like(f):
    """Return True if `f` is a str, bytes or os.PathLike object."""
    return isinstance(f, (str, bytes, os.PathLike))
1509
1510
class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    Cookie jar that reads and writes Netscape-format cookie files, either by
    path or through an already opened file object.

    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        """`filename` may be a path (str, bytes or os.PathLike) or a file object."""
        super().__init__(None, *args, **kwargs)
        if is_path_like(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        """Yield a text stream for `file`.

        Paths are opened as UTF-8 and closed on exit. File objects are
        yielded as they are and left open; for writing they are emptied first.
        """
        if is_path_like(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                # truncate() does not move the stream position, so rewind first;
                # otherwise writing after an earlier read would start at the
                # old offset and zero-fill the beginning of the stream
                file.seek(0)
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        """Write one tab-separated line per cookie to `f`."""
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117

        Raises ValueError if neither `filename` nor self.filename is set.
        """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        Malformed entries are skipped with a warning. A file that looks like
        JSON raises http.cookiejar.LoadError. Raises ValueError if neither
        `filename` nor self.filename is set.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Strip the "#HttpOnly_" prefix so that the entry is parsed as a normal cookie
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        # Collect the validated lines in memory and let the base class parse them
        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    # A line starting with one of [ { " suggests a JSON export
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1639
1640
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """Cookie processor that applies the HTTP hooks to HTTPS traffic as well."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        """Delegate to the stock HTTPCookieProcessor implementation."""
        return super().http_response(request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1650
1651
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    Compared to the stock handler, this one:
        - routes 301, 303, 307 and 308 [2] (used by some sites [3]) through
          the same logic as 302
        - turns POST into GET on 301/302, and any method except HEAD into
          GET on 303

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        method = req.get_method()
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        follows_safe_method = code in (301, 302, 303, 307, 308) and method in ("GET", "HEAD")
        follows_post = code in (301, 302, 303) and method == "POST"
        if not (follows_safe_method or follows_post):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        target = newurl.replace(' ', '%20')

        # Headers describing the request body are not carried over
        body_headers = ("content-length", "content-type")
        forwarded = {}
        for name, value in req.headers.items():
            if name.lower() not in body_headers:
                forwarded[name] = value

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if (code == 303 and method != 'HEAD') or (code in (301, 302) and method == 'POST'):
            method = 'GET'

        return urllib.request.Request(
            target, headers=forwarded, origin_req_host=req.origin_req_host,
            unverifiable=True, method=method)
fca6dba8
S
1712
1713
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing timezone off a date string.

    Returns a (timedelta, remaining_date_str) tuple. A numeric offset
    ("+01:00", "-0700") or "Z" is tried first; otherwise a trailing
    upper-case abbreviation is looked up in TIMEZONE_NAMES. When nothing is
    recognised the offset is zero and the string is returned unchanged.
    """
    offset_match = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)

    if offset_match:
        date_str = date_str[:-len(offset_match.group('tz'))]
        if not offset_match.group('sign'):
            # Bare "Z": UTC
            return datetime.timedelta(), date_str
        direction = 1 if offset_match.group('sign') == '+' else -1
        offset = datetime.timedelta(
            hours=direction * int(offset_match.group('hours')),
            minutes=direction * int(offset_match.group('minutes')))
        return offset, date_str

    # No numeric offset: fall back to a named zone following a clock time
    name_match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    tz_hours = TIMEZONE_NAMES.get(name_match and name_match.group('tz').strip())
    if tz_hours is not None:
        date_str = date_str[:-len(name_match.group('tz'))]
    return datetime.timedelta(hours=tz_hours or 0), date_str
1742
1743
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param delimiter  Separator between the date and the time part
    @param timezone   UTC offset as a timedelta; detected from date_str when None
    Returns None when date_str is None or does not fit the expected layout.
    """
    if date_str is None:
        return None

    # Fractional seconds are dropped
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        layout = f'%Y-%m-%d{delimiter}%H:%M:%S'
        utc_time = datetime.datetime.strptime(date_str, layout) - timezone
        return calendar.timegm(utc_time.timetuple())
    except ValueError:
        return None
912b38b4
PH
1759
1760
46f59e89
S
def date_formats(day_first=True):
    """Return the strptime format list matching the requested day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1763
1764
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Commas become spaces, then AM/PM (+ timezone name) and any UTC offset are dropped
    cleaned = date_str.replace(',', ' ')
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    upload_date = None
    # Every format is tried; the last one that parses wins
    for fmt in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if upload_date is None else str(upload_date)
bf50b038 1787
5f6a1245 1788
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    Returns None when date_str is None or no known format matches.
    """
    if date_str is None:
        return None

    # Drop separators and weekday names, then collapse whitespace
    without_weekday = re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)
    date_str = re.sub(r'\s+', ' ', without_weekday)

    pm_delta = 0
    if re.search(r'(?i)PM', date_str):
        pm_delta = 12
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    tz_match = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if tz_match:
        date_str = date_str[:-len(tz_match.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    nano_match = re.search(
        r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if nano_match:
        date_str = nano_match.group(1)

    # The first format that parses decides the result
    for fmt in date_formats(day_first):
        try:
            moment = (datetime.datetime.strptime(date_str, fmt)
                      - timezone + datetime.timedelta(hours=pm_delta))
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    timetuple = email.utils.parsedate_tz(date_str)
    if not timetuple:
        return None
    return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1820
1821
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to default_ext."""
    if url is None or '.' not in url:
        return default_ext

    # Whatever follows the last dot before the query string
    candidate = url.split('?', 1)[0].rsplit('.', 1)[-1]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1833
5f6a1245 1834
824fa511
S
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return filename with its extension replaced by "<sub_lang>.<sub_format>"."""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
d4051a8e 1837
5f6a1245 1838
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # Plain DATE without an offset
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the offset may itself be any supported expression
    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')

    if unit in ('month', 'year'):
        # Calendar-aware arithmetic; the result is rounded to whole days
        new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(new_date, unit) if auto_precision else new_date
1879
1880
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    Raises ValueError when strict is set and date_str is outside those patterns.
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    moment = datetime_from_str(date_str, precision='microsecond', format=format)
    return moment.date()
1891
1892
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the last day of the target month.
    """
    # Work with a zero-based month index so divmod yields the year carry
    year_carry, month_index = divmod(dt.month + months - 1, 12)
    target_year = dt.year + year_carry
    target_month = month_index + 1
    last_day = calendar.monthrange(target_year, target_month)[1]
    return dt.replace(target_year, target_month, min(dt.day, last_day))
1900
1901
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    'microsecond' returns dt untouched; otherwise the value is rounded to the
    nearest day/hour/minute/second and returned as a naive UTC datetime.
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    timestamp = calendar.timegm(dt.timetuple())
    step = unit_seconds[precision]
    # Adding half a step before flooring rounds to the nearest multiple
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
5f6a1245
JW
1918
1919
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged.
    """
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return date_str if parts is None else '-'.join(parts.groups())
1928
5f6a1245 1929
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError if start lies after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start, strict=True))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end, strict=True))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            # Anything else is parsed as a date string
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
1963
c496ca96
PH
1964
def platform_name():
    """ Returns the platform name as a str (deprecated; use platform.platform) """
    notice = f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead'
    deprecation_warning(notice)
    return platform.platform()
c496ca96 1969
b1f94422 1970
@functools.cache
def system_identifier():
    """Return a one-line description of the interpreter and platform (cached)."""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    try:
        libc_info = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        libc_info = []

    return 'Python %s (%s %s) - %s %s' % (
        platform.python_version(),
        implementation,
        platform.architecture()[0],
        platform.platform(),
        format_field(join_nonempty(*libc_info, delim=' '), None, '(%s)'),
    )
c257baff
PH
1987
1988
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
49fa4d9a
N
1996
1997
def write_string(s, out=None, encoding=None):
    """Write the str s to out (default: sys.stderr) and flush it.

    Binary streams, and text streams exposing a .buffer, receive encoded
    bytes (unencodable characters are dropped); anything else gets s as-is.
    """
    assert isinstance(s, str)
    out = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # Insert a space in front of every run of line breaks
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(out, 'mode', ''):
        target = out
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
    else:
        target, enc = out, None

    target.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
2014
2015
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report a deprecation.

    Outside the CLI a DeprecationWarning is issued through the warnings
    module. Inside the CLI each distinct message is emitted once, via
    printer if given, otherwise via write_string.
    """
    from . import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    already_reported = deprecation_warning._cache
    if msg in already_reported:
        return
    already_reported.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Messages already shown in CLI mode
deprecation_warning._cache = set()
2031
2032
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the items of bs as a list of ints (empty list for empty input)."""
    if not bs:
        return []
    # Python 3 bytes already iterate as ints; anything else is mapped through ord()
    return list(bs) if isinstance(bs[0], int) else [ord(item) for item in bs]
2040
c257baff 2041
def intlist_to_bytes(xs):
    """Pack a sequence of ints (each 0..255) into a bytes object."""
    if not xs:
        return b''
    layout = '%dB' % len(xs)
    return struct.pack(layout, *xs)
c38b1e77
PH
2046
2047
class LockingUnsupportedError(OSError):
    """Raised when no file-locking implementation is available."""

    msg = 'File locking is not supported'

    def __init__(self):
        OSError.__init__(self, self.msg)
2053
2054
c1c9a79c
PH
# Cross-platform file locking
#
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) using
# LockFileEx/UnlockFileEx on Windows and fcntl elsewhere. When neither is
# available, both functions raise LockingUnsupportedError.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range passed to LockFileEx/UnlockFileEx (low/high DWORD)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock f; raises BlockingIOError if LockFileEx reports failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object: the same structure is needed for unlocking
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # Pass the error code explicitly, as _lock_file does: the
            # no-argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock f with flock(), falling back to lockf()."""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # The lock is held elsewhere: report it instead of falling back
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Unlock f with flock(), falling back to lockf()."""
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
c1c9a79c
PH
2140
2141
class locked_file:
    """File wrapper that holds a lock on the file while it is in use.

    Usable as a context manager or through open()/close(). Modes containing
    'r' take a shared lock, all others an exclusive one. Unknown attributes
    are forwarded to the underlying file object.
    """

    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(flag in mode for flag in 'wax+')
        readable = any(flag in mode for flag in 'r+')

        flags = getattr(os, 'O_CLOEXEC', 0)         # UNIX only
        flags |= getattr(os, 'O_BINARY', 0)         # Windows only
        flags |= getattr(os, 'O_NOINHERIT', 0)      # Windows only
        if writable:
            flags |= os.O_CREAT                     # O_TRUNC only after locking
        if 'a' in mode:
            flags |= os.O_APPEND
        if 'x' in mode:
            flags |= os.O_EXCL
        if not writable:
            flags |= os.O_RDONLY
        elif readable:
            flags |= os.O_RDWR
        else:
            flags |= os.O_WRONLY

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        shared = 'r' in self.mode
        try:
            _lock_file(self.f, not shared, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncation is deferred until the lock is held
            try:
                self.f.truncate()
            except OSError as err:
                expected = (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                )
                if err.errno not in expected:
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; the locked flag is always cleared."""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
a3125791 2205
4eb7f1d1 2206
@functools.cache
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' when Python reports None (cached)."""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
2211
2212
def shell_quote(args):
    """Return args joined into one shell-quoted command line string."""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(as_text(arg)) for arg in args)
9d4660ca
PH
2222
2223
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    Data already smuggled in url is merged into (and overrides) data, which
    is updated in place. The result is url + '#' + the encoded payload.
    """
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{payload}'
9d4660ca
PH
2232
2233
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    Returns (smug_url, default) when the URL carries no smuggled payload.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    encoded = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
2241
2242
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal sufixes like K, M, etc

    @param num     Number to format; None, non-numeric and negative values give None
    @param fmt     %-format receiving (scaled number, suffix)
    @param factor  Ratio between consecutive suffixes; 1024 selects Ki, Mi, ...
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to [0, len(suffixes)]: for 0 < num < 1/factor the logarithm is
        # below -1, and a negative exponent would index the suffix list from
        # the end (yielding e.g. "Y") and scale the number up instead of down
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
e0fd9573 2255
2256
def format_bytes(bytes):
    """Format a byte count with binary suffixes (KiB, MiB, ...), or 'N/A'."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
f53c966a 2259
1c088fa8 2260
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse "<number><unit>" at the start of s and scale it via unit_table.

    The number may use ',' or '.' as decimal separator. Returns the scaled
    value truncated to int, or None when s does not start with such a pair.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    match = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if match is None:
        return None
    multiplier = unit_table[match.group('unit')]
    number = float(match.group('num').replace(',', '.'))
    return int(number * multiplier)
2270
2271
be64b5b0
PH
def parse_filesize(s):
    """Parse a human-readable file size such as "1.5 MiB" into a byte count.

    Returns None for None or unparsable input.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    # (symbol, decimal word prefix, binary word prefix), in increasing magnitude
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (symbol, decimal_word, binary_word) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = symbol.lower()
        _UNIT_TABLE.update({
            f'{symbol}iB': binary,
            f'{symbol}B': decimal,
            f'{lower}B': binary,
            f'{symbol}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_word}bytes': decimal,
            f'{binary_word}bytes': binary,
        })

    return lookup_unit_table(_UNIT_TABLE, s)
2341
2342
def parse_count(s):
    """Parse a count such as "1.2M", "10k views" or "1,234" into an int.

    Returns None for None or unparsable input.
    """
    if s is None:
        return None

    # Drop a leading non-numeric label, then surrounding whitespace
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    scaled = lookup_unit_table(_UNIT_TABLE, s)
    if scaled is not None:
        return scaled

    # A number followed by arbitrary text
    leading_number = re.match(r'([\d,.]+)(?:$|\s)', s)
    if not leading_number:
        return None
    return str_to_int(leading_number.group(1))
be64b5b0 2370
2f7ae819 2371
def parse_resolution(s, *, lenient=False):
    """Extract width/height from text such as "1920x1080", "720p" or "4K".

    Returns a dict with 'width' and/or 'height', or {} when nothing is found.
    With lenient set, "WxH" may be directly adjacent to letters or digits.
    """
    if s is None:
        return {}

    if lenient:
        dimensions = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
    else:
        dimensions = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if dimensions:
        width, height = dimensions.group('w', 'h')
        return {
            'width': int(width),
            'height': int(height),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    k_label = re.search(r'\b([48])[kK]\b', s)
    if k_label:
        # 4K -> 2160, 8K -> 4320
        return {'height': int(k_label.group(1)) * 540}

    return {}
2395
2396
def parse_bitrate(s):
    """Return the integer in front of "kbps" in s, or None if there is none."""
    if not isinstance(s, str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2403
2404
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name

    Names for lang are used when available, English ones otherwise.
    Returns None for an unknown name.
    """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    if name not in month_names:
        return None
    return month_names.index(name) + 1
2414
2415
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations; None if abbrev matches no month """

    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
18258362
JMF
2424
2425
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start a predefined or numeric entity are kept.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
2432
2433
def setproctitle(title):
    """Best-effort: set the process name via libc's prctl().

    Silently does nothing when ctypes or 'libc.so.6' is unavailable, or when
    the loaded libc has no prctl.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    # Initialising the buffer from the bytes makes it one byte larger than the
    # title, so it is NUL-terminated. A buffer of exactly len(title_bytes) has
    # no terminator, and prctl() reads the name as a C string.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
2459
2460
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged (None included) otherwise."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
2463
2464
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged (None included) otherwise."""
    # An empty suffix must be special-cased: every string ends with '', and
    # s[:-0] is s[:0], which would wipe the whole string
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
2b9faf55
PH
2467
2468
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching surrounding single or double quotes from s."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2476
2477
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"

    Returns the network location of url without a leading "www.", or None
    when that is empty.
    """
    netloc = urllib.parse.urlparse(url).netloc
    domain = remove_start(netloc, 'www.')
    return domain or None
b6e0c7d2
U
2484
2485
def url_basename(url):
    """Return the last path component of url ('' when the path is empty)."""
    trimmed_path = urllib.parse.urlparse(url).path.strip('/')
    return trimmed_path.rsplit('/', 1)[-1]
aa94a6d3
PH
2489
2490
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any query or fragment."""
    directory = re.match(r'https?://[^?#]+/', url)
    return directory.group()
02dc0a36
S
2493
2494
def urljoin(base, path):
    """Join path onto base, tolerating bytes and invalid input.

    Returns path itself when it is already absolute or protocol-relative,
    and None when path is empty/not a string or base is not an http(s) or
    protocol-relative URL.
    """
    def as_text(value):
        return value.decode() if isinstance(value, bytes) else value

    path = as_text(path)
    if not isinstance(path, str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    base = as_text(base)
    if not isinstance(base, str):
        return None
    if not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
e34c3361
S
2508
2509
class HEADRequest(urllib.request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
2513
2514
class PUTRequest(urllib.request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
2518
2519
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, scaled by invscale // scale; return default on failure.

    @param get_attr  If set (and v is not None), convert this attribute of v instead
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        converted = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
    return converted
9732d77e 2527
9572013d 2528
def str_or_none(v, default=None):
    """Return str(v), or default when v is None."""
    if v is None:
        return default
    return str(v)
40a90862 2531
9732d77e
PH
2532
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    ints pass through unchanged; in strings the characters ',', '.' and '+'
    are removed before conversion.
    """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
608d11f5
PH
2540
2541
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, scaled by invscale / scale; return default on failure or None."""
    if v is None:
        return default
    try:
        converted = float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
    return converted
43f775e4
PH
2549
2550
c7e327c4
S
def bool_or_none(v, default=None):
    """Return v if it is a bool, else default."""
    if isinstance(v, bool):
        return v
    return default
2553
2554
def strip_or_none(v, default=None):
    """Return v stripped of surrounding whitespace if it is a str, else default."""
    if not isinstance(v, str):
        return default
    return v.strip()
b72b4431
S
2557
2558
def url_or_none(url):
    """Return the stripped url if it looks like a supported or protocol-relative URL, else None."""
    if not url or not isinstance(url, str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
af03000a
S
2564
2565
def request_to_url(req):
    """Return the full URL of a urllib Request; any other value is returned as-is."""
    if not isinstance(req, urllib.request.Request):
        return req
    return req.get_full_url()
2571
2572
def strftime_or_none(timestamp, date_format, default=None):
    """Format a UNIX timestamp or a 'YYYYMMDD' string with date_format.

    @param timestamp    int/float UNIX timestamp (formatted in UTC) or a
                        'YYYYMMDD' string
    @param date_format  strftime format; "%s" is expanded manually to the
                        integer timestamp
    @param default      Returned when timestamp cannot be interpreted or formatted
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Building the value from the epoch (instead of calling
            # fromtimestamp(timestamp)) also handles negative timestamps on
            # platforms whose C library rejects them
            datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                               + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        # OverflowError: timestamp outside the representable date range
        # OSError: platform time functions rejecting the value
        return default
2587
2588
def parse_duration(s):
    """Parse a duration string into seconds (float).

    Accepts clock notation ("1:02:03.5"), unit notation including ISO 8601
    ("PT1H3M", "3 min 5 s") and plain "2.5 hours" / "87 min." forms.
    Returns None for non-strings, blank strings and unrecognised input.
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None

    # [[[DD:]HH:]MM:]SS[.ms]
    clock = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if clock:
        days, hours, mins, secs, ms = clock.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # Unit notation; years, months and weeks are accepted but not counted
        units = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if units:
            days, hours, mins, secs, ms = units.group('days', 'hours', 'mins', 'secs', 'ms')
        else:
            # A single, possibly fractional, amount of hours or minutes
            single = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not single:
                return None
            hours, mins = single.group('hours', 'mins')

    if ms:
        # "SS:mmm" uses a colon in front of the fraction
        ms = ms.replace(':', '.')
    weighted = ((days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1))
    return sum(float(part or 0) * mult for part, mult in weighted)
91d7d0b3
JMF
2643
2644
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the existing extension of `filename`.

    If `expected_real_ext` is given and the file's actual extension differs
    from it, `ext` is appended to the whole filename instead.
    """
    stem, current_ext = os.path.splitext(filename)
    if expected_real_ext and current_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{stem}.{ext}{current_ext}'
d70ad093
PH
2651
2652
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of `filename` with `ext`.

    If `expected_real_ext` is given and the file's actual extension differs
    from it, nothing is stripped and `ext` is appended to the whole filename.
    """
    stem, current_ext = os.path.splitext(filename)
    if expected_real_ext and current_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = stem
    return '{}.{}'.format(base, ext)
2658
2659
d70ad093
PH
def check_executable(exe, args=[]):
    """Check whether the given binary can be run, and return its name.

    @param exe   Name or path of the executable
    @param args  List of arguments that produce a short output (like -version)
    @returns     `exe` if the process could be started, False if starting it
                 raised OSError
    """
    try:
        Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    else:
        return exe
b7ab0590
PH
2668
2669
def _get_exe_version_output(exe, args, *, to_screen=None):
    """Run `exe` with `args` and return its combined stdout/stderr as text.

    @param to_screen  Optional callable; if given, it is passed a message
                      describing the command before it is run
    @returns          The captured output, or False if running the executable
                      raised OSError
    """
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        captured, _, _ = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return captured
cae97f65
PH
2682
2683
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    @param output        Text printed by the executable (must be str)
    @param version_re    Regex whose first group captures the version.
                         Defaults to matching "version <something>"
    @param unrecognized  Value returned when the regex does not match
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2693
2694
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """Return the version of the specified executable.

    Returns False if the executable could not be run or printed nothing;
    otherwise the result of detect_exe_version() on its output.
    """
    out = _get_exe_version_output(exe, args)
    if not out:
        return False
    return detect_exe_version(out, version_re, unrecognized)
2701
2702
def frange(start=0, stop=None, step=1):
    """Float range

    Generator version of range() that also accepts non-integer arguments.
    With a single argument, it is taken as `stop` and counting starts at 0.
    A zero `step` yields nothing.
    """
    if stop is None:
        start, stop = 0, start
    if not step:
        direction = 0
    elif step > 0:
        direction = 1
    else:
        direction = -1
    current = start
    # Multiplying both sides by the direction lets one comparison
    # serve both ascending and descending ranges
    while direction * current < direction * stop:
        yield current
        current += step
2711
2712
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable

    Items are pulled from the iterable only when needed and are kept in a cache.
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        # Items already consumed from the iterable, in their original order
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        """Consume the rest of the iterable; returns the cache in original order"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        direction = -1 if self._reversed else 1
        return self._exhaust()[::direction]

    @staticmethod
    def _reverse_index(x):
        # ~x maps index x counted from one end to the same position from the other
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        needs_everything = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if needs_everything:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            # Only pull as many items as are needed to reach the highest index
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Fetching one item is enough to know whether the list is empty
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # The new object shares both the underlying iterable and the cache
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2800
483336e7 2801
class PagedList:
    """Base class for lists whose items are fetched one page at a time.

    Subclasses must implement _getslice(start, end)
    """

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        # Pages with a number above this are treated as empty without being fetched
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return the items of page `pagenum` as a list, using the cache if possible"""
        cached = self._cache.get(pagenum)
        if cached is not None:
            return cached
        page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()
55575225 2840
9c44d242
PH
2841
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets within this page; only the first and last page of the
            # requested range need to be trimmed
            startv = start % self._pagesize if firstid <= start < nextfirstid else 0
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Record the last page known to be fetchable before propagating
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
81c2f20b
PH
2881
2882
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        super().__init__(pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        if end is None:
            last_page = self._pagecount
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
        # Number of leading items to drop from the first page
        skip_elems = start - first_page * self._pagesize
        # Number of items still to be yielded; None means "no limit"
        only_more = None if end is None else end - start
        for pagenum in range(first_page, last_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) >= only_more:
                    yield from page_results[:only_more]
                    break
                only_more -= len(page_results)
            yield from page_results
9c44d242
PH
2907
2908
class PlaylistEntries:
    """Wrapper around the 'entries' of a playlist info dict.

    Indexing and slicing use 1-based playlist positions and yield
    (playlist_index, entry) pairs.
    """
    # Placeholder for positions that have no entry in an incomplete playlist
    MissingEntry = object()
    # Set once all the entries are known to have been evaluated
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = bool(requested_entries)
        if self.is_incomplete:
            assert self.is_exhausted
            # Place each given entry at its requested (1-based) position
            self._entries = [self.MissingEntry] * max(requested_entries)
            for position, entry in zip(requested_entries, entries):
                self._entries[position - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or a slice for each comma-separated segment of `string`

        Raises ValueError for empty or malformed segments and for zero steps
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            if has_range:
                yield slice(int_or_none(start), float_or_none(end), int_or_none(step))
            else:
                yield int(start)

    def get_requested_items(self):
        """Yield (playlist_index, entry) for the items selected by the ydl params"""
        params = self.ydl.params
        playlist_items = params.get('playlist_items')
        playlist_start = params.get('playliststart', 1)
        playlist_end = params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # TODO: Add auto-generated fields
                    self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries if it is known, else None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function that returns the entry at a 0-based index

        It raises self.IndexError when the index is beyond the end of the entries
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        elif idx.start >= 0:
            start = idx.start - 1
        else:
            start = len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        elif idx.stop >= 0:
            stop = idx.stop - 1
        else:
            stop = len(self) + idx.stop
        # The given stop is inclusive, unlike the one frange expects
        stop += 1 if step > 0 else -1

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
3041
3042
def uppercase_escape(s):
    """Replace every \\UXXXXXXXX escape sequence in `s` with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns a (text, length consumed) pair
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
3049
3050
def lowercase_escape(s):
    """Replace every \\uXXXX escape sequence in `s` with the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns a (text, length consumed) pair
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 3057
d05cfe06
S
3058
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that are passed through unescaped; '%' is included so that
    # sequences which are already percent-encoded are left alone
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe_chars)
d05cfe06
S
3062
3063
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)
    escaped = parts._replace(
        # The network location is IDNA-encoded; every other
        # component is percent-encoded
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    )
    return escaped.geturl()
3074
62e609ab 3075
def parse_qs(url):
    """Parse the query string of `url` into a dict mapping each key to a list of values"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query)
4dfbf869 3078
3079
62e609ab
PH
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file.

    @param batch_fd  File-like object yielding str or bytes lines. It is
                     closed before returning.
    @returns         List of URLs. Blank lines and lines starting with
                     '#', ';' or ']' are skipped, and trailing comments
                     (a '#' preceded by whitespace) are stripped.
    """
    # The BOM as it appears when the file was decoded properly ('\ufeff')
    # and when its UTF-8 bytes were decoded as a single-byte encoding
    BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit is passed by keyword since passing it positionally is deprecated
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
3097
3098
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urllib.parse.urlencode, but return ASCII bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
3101
3102
def update_url_query(url, query):
    """Return `url` with the parameters in `query` added to its query string

    Parameters already present in the URL are replaced by those in `query`.
    If `query` is empty, the URL is returned unchanged.
    """
    if not query:
        return url
    parts = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(parts.query)
    params.update(query)
    new_query = urllib.parse.urlencode(params, True)
    return urllib.parse.urlunparse(parts._replace(query=new_query))
16392824 3111
8e60dc75 3112
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Create a copy of the request `req` with the given modifications

    @param url      New URL; defaults to that of `req`
    @param data     New body; `req.data` is used when this is falsy
    @param headers  Headers to add/override on top of those of `req`
    @param query    Query parameters to add to the URL (see update_url_query)
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers or {})
    body = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    # Use the request class matching the method so that it is preserved
    method = req.get_method()
    if method == 'HEAD':
        request_cls = HEADRequest
    elif method == 'PUT':
        request_cls = PUTRequest
    else:
        request_cls = urllib.request.Request

    clone = request_cls(
        target_url, data=body, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
3131
3132
10c87c15 3133def _multipart_encode_impl(data, boundary):
0c265486
YCH
3134 content_type = 'multipart/form-data; boundary=%s' % boundary
3135
3136 out = b''
3137 for k, v in data.items():
3138 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3139 if isinstance(k, str):
0f06bcd7 3140 k = k.encode()
14f25df2 3141 if isinstance(v, str):
0f06bcd7 3142 v = v.encode()
0c265486
YCH
3143 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3144 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3145 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3146 if boundary.encode('ascii') in content:
3147 raise ValueError('Boundary overlaps with data')
3148 out += content
3149
3150 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3151
3152 return out, content_type
3153
3154
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple of the encoded body (bytes) and the Content-Type value.
    If the specified boundary occurs within the data, ValueError is raised.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A boundary chosen by the caller cannot be replaced, so any clash propagates
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # The random boundary clashed with the data; try another one
            continue
3183
3184
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return `x` unchanged if it is an iterable, else wrap it in a 1-tuple

    Instances of `allowed_types` are also wrapped, even though they are iterable
    """
    if isinstance(x, allowed_types) or not isinstance(x, collections.abc.Iterable):
        return (x,)
    return x
3187
3188
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first usable value among the given key(s) of `d`

    None values are always skipped; other falsy values are skipped
    unless `skip_false_values` is False. If nothing is found, `default` is returned.
    """
    for key in variadic(key_or_keys):
        val = d.get(key)
        if val is None:
            continue
        if val or not skip_false_values:
            return val
    return default
cbecc9b9
S
3194
3195
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn and return the first acceptable result

    Every function is called with `args` and `kwargs`. A result is acceptable
    if the call did not raise one of the lookup/conversion errors listed below
    and, when `expected_type` is given, the result is an instance of it.
    Returns None if no function gives an acceptable result.
    """
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
3205
3206
def try_get(src, getter, expected_type=None):
    """Apply the getter(s) to `src` and return the first acceptable result

    `getter` may be a single callable or an iterable of them. See try_call
    for what counts as acceptable.
    """
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
329ca3be
S
3209
3210
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with only the items for which cndn(key, value) is true

    By default, items whose value is None are dropped
    """
    filtered = {}
    for key, value in dct.items():
        if cndn(key, value):
            filtered[key] = value
    return filtered
3213
3214
6cc62232
S
def merge_dicts(*dicts):
    """Merge the dicts, giving priority to the ones that come first

    None values are ignored. An empty string that was merged earlier is
    replaced by a later string value for the same key.
    """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if value is None:
                continue
            if key not in merged or (isinstance(value, str) and merged[key] == ''):
                merged[key] = value
    return merged
3223
3224
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as str, decoding it with `encoding` if it is not one already"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
8e60dc75 3227
16392824 3228
a1a530b0
PH
# Age limit for each US film rating
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limit for each US TV parental guidelines rating
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """Convert an age limit or content rating to an age in years

    @param s  An int between 0 and 21, a string of the form "18"/"18+",
              a key of US_RATINGS, or a TV rating such as "TV-14"/"TV_14"/"TV14".
              Ratings are matched case-insensitively.
    @returns  The age as int, or None if `s` is not recognized
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None

    age_match = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if age_match:
        return int(age_match.group('age'))

    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]

    # k[3:] strips the "TV-" prefix, leaving only the rating itself
    tv_ratings = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_match = re.match(r'^TV[_-]?(%s)$' % tv_ratings, s)
    if tv_match:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_match.group(1)]
    return None
146c80e2
S
3264
3265
def strip_jsonp(code):
    """Strip the callback wrapper of a JSONP response, leaving only its argument

    Handles forms such as `cb({...});`, `window.cb({...})` and
    `window.cb && window.cb({...})`, optionally followed by // comments.
    If `code` does not look like JSONP, it is returned unchanged.
    """
    JSONP_RE = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(JSONP_RE, r'\g<callback_data>', code)
478c2c61
PH
3274
3275
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript object/array literal into a JSON string

    @param code    The JavaScript code to convert
    @param vars    Dict of {name: value} pairs; identifiers found in the code
                   that match a name are substituted by the JSON form of its value
    @param strict  If true, unknown identifiers raise ValueError instead of
                   being quoted as strings, and `new X(...)` expressions are
                   not rewritten
    """
    STRING_QUOTES = '\'"'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (regex, base) pairs for integer literals that JSON does not support
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Group 1 is a bare double quote; group 2 is the character following a backslash
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        if escape in JSON_PASSTHROUGH_ESCAPES:
            return Rf'\{escape}'
        if escape == 'x':
            # \xNN -> \u00NN
            return R'\u00'
        if escape == '\n':
            # An escaped newline is a line continuation
            return ''
        return escape

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token in ('undefined', 'void 0'):
            return 'null'
        if token.startswith(('/*', '//', '!')) or token == ',':
            # Comments, negations and trailing commas are dropped
            return ''

        if token[0] in STRING_QUOTES:
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, token[1:-1])
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            int_match = re.match(regex, token)
            if int_match:
                number = int(int_match.group(1), base)
                # A trailing colon means that the number is an object key
                return f'"{number}":' if token.endswith(':') else str(number)

        if token in vars:
            return json.dumps(vars[token])

        if not strict:
            return f'"{token}"'

        raise ValueError(f'Unknown value: {token}')

    def create_map(mobj):
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
e05f6939
PH
3339
3340
478c2c61
PH
def qualities(quality_ids):
    """Get a numeric quality value out of a list of possible values

    Returns a function that maps a quality id to its position in
    `quality_ids`, or to -1 if the id is not in it
    """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
3349
acd69589 3350
POSTPROCESS_WHEN = (
    'pre_process', 'after_filter', 'before_dl', 'post_process',
    'after_move', 'after_video', 'playlist',
)


DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# Template for a regex matching a single printf-style conversion specifier.
# Fill it in with str.format: {0} is the pattern for the mapping key
# and {1} is the pattern for the conversion type.
# As of [1] the format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# All the conversion type characters of printf-style formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3390
7d1eb38a 3391
a020a0dc
PH
def limit_length(s, length):
    """Add ellipses to overly long strings

    @param s       The string to shorten; None is returned as-is
    @param length  Maximum length of the result
    @returns       `s` itself if it fits, else its beginning followed by '...'.
                   When `length` is too small to hold even the ellipses,
                   `s` is simply cut down to `length` characters.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # Slicing with a negative end would remove characters from the end
        # of `s` rather than limit its length
        return s[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
3400
3401
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints

    Raises ValueError if any of the components is not an integer
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
3404
3405
def is_outdated_version(version, limit, assume_new=True):
    """Check whether `version` is older than `limit`

    If `version` is empty or either version cannot be parsed, the answer is
    decided by `assume_new`: the version is then considered outdated only
    when `assume_new` is false.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
3413
3414
def ytdl_is_updateable():
    """Returns if yt-dlp can be updated with -U"""
    # Imported here rather than at the top of the module
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
7d4111ed
PH
3421
3422
def args_to_str(args):
    """Get a short string representation for a subprocess command

    Every argument is shell-quoted and the results are joined with spaces
    """
    return ' '.join(map(compat_shlex_quote, args))
2ccd1b10
PH
3426
3427
def error_to_compat_str(err):
    """Return the message of the exception `err` as a str"""
    message = str(err)
    return message
fdae2358
S
3430
3431
def error_to_str(err):
    """Describe the exception `err` in the form "<type name>: <message>" """
    error_type = type(err).__name__
    return f'{error_type}: {err}'
3434
3435
def mimetype2ext(mt):
    """Guess the file extension for a MIME type

    @param mt  MIME type, optionally followed by ";"-separated parameters
               (which are ignored), or None
    @returns   The extension without the leading dot, or None if `mt` is None.
               Unknown types fall back to their subtype with "+" replaced by "."
    """
    if mt is None:
        return None

    # Drop parameters such as "; charset=utf-8"
    mt = mt.partition(';')[0].strip()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }
    if mt in FULL_MAP:
        return FULL_MAP[mt]

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    # The subtype is whatever follows the last "/"; it is looked up in lowercase
    subtype = mt.rpartition('/')[2]
    lowered = subtype.lower()
    if lowered in SUBTYPE_MAP:
        return SUBTYPE_MAP[lowered]

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    # Structured syntax suffix, e.g. "json" in "ld+json"
    suffix = subtype.partition('+')[2]
    if suffix in SUFFIX_MAP:
        return SUFFIX_MAP[suffix]

    return subtype.replace('+', '.')
c460bdd5
PH
3498
3499
2814f12b
THD
def ext2mimetype(ext_or_url):
    """Guess the MIME type from a file extension, filename or URL

    Returns None if the argument is empty or the type cannot be guessed
    """
    if not ext_or_url:
        return None
    # Anything without a dot is taken to be a bare extension
    target = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mimetype, _ = mimetypes.guess_type(target)
    return mimetype
3506
3507
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video, audio and subtitle codecs

    The format of the string is described in http://tools.ietf.org/html/rfc6381

    @returns  {} if the string is empty or nothing can be determined from it.
              Otherwise a dict with 'vcodec' and 'acodec' ('none' when absent)
              and, when at least one codec was recognized, 'dynamic_range'
              (None if no HDR was detected) and optionally 'scodec'.
    """
    if not codecs_str:
        return {}
    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]
    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros directly preceding a digit are removed before comparing (av01 -> av1)
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        codec_id = parts[0]
        if codec_id in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            # Only the first video codec is considered
            if vcodec:
                continue
            vcodec = full_codec
            if codec_id in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec_id == 'av1' and traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif codec_id in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif codec_id in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if vcodec or acodec or scodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if scodec is not None:
            result['scodec'] = scodec
        return result
    if len(split_codecs) == 2:
        # Nothing was recognized; assume that the order is video, audio
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3548
3549
fc61aff4
LL
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension for merging the given streams

    @param vcodecs, vexts   Codecs and extensions of the video streams
                            (must be of the same length)
    @param acodecs, aexts   Codecs and extensions of the audio streams
                            (must be of the same length)
    @param preferences      Extensions that may be used, in order of preference.
                            If not given, any extension may be chosen
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codecs):
        # Base name of the first codec with every "0" removed (av01.0.05M -> av1)
        return try_get(codecs, getter=lambda x: x[0].split('.')[0].replace('0', ''))

    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    # First try to find a container that supports both the codecs
    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    # Otherwise, decide based on the extensions of the streams
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext}:
            return ext
        if any(ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3588
3589
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the headers of a response handle.

    The filename of an 'attachment' Content-Disposition header is preferred;
    otherwise the extension is derived from the Content-Type header.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = determine_ext(mobj.group('filename'), default_ext=None) if mobj else None
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
3602
3603
1e399778
YCH
def encode_data_uri(data, mime_type):
    """ Build a base64 "data:" URI for the bytes in data with the given MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3606
3607
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    @param content_limit  Minimum age required by the content (None: no requirement)
    @param age_limit      Age limit configured by the user (None: no limit set)
    """
    if age_limit is None or content_limit is None:
        # Either no limit is set or the content is available for everyone
        return False
    return age_limit < content_limit
61ca9a80
PH
3616
3617
# List of known byte-order-marks (BOM), as (raw prefix, codec name) pairs.
# The UTF-32-LE mark begins with the same two bytes as the UTF-16-LE mark, so
# it is listed first: code that walks this list in order (like is_html) then
# sees the longer mark before the shorter one.
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
a904a7f8
L
3626
3627
61ca9a80
PH
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Leading byte-order-marks listed in BOMS are stripped (repeatedly) and select
    the encoding used for decoding; utf-8 is assumed when there is none.
    @returns  A match object (truthy) if the text starts with optional
              whitespace followed by "<", else None
    """
    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        while first_bytes.startswith(bom):
            first_bytes = first_bytes[len(bom):]
            encoding = bom_encoding

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
a055469f
PH
3637
3638
def determine_protocol(info_dict):
    """ Return the download protocol of a format/info dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived from
    the start of the (sanitized) URL, then from its extension, and finally
    from the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3659
3660
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned.

    @param header_row  List with the column titles
    @param data        List of rows
    @param delim       If truthy, the string repeated to draw a rule below the header
    @param extra_gap   Additional spaces between columns (one is always added)
    @param hide_empty  Drop the columns whose cells in data all render empty
    @returns           The table as a single string; rows are right-stripped
    """
    def visible_len(text):
        return len(remove_terminal_sequences(text).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_len(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, mask):
        # Positions not covered by mask are kept (fillvalue=True)
        return [cell for keep, cell in itertools.zip_longest(mask, row, fillvalue=True) if keep]

    # A column width of 0 in data is falsy and therefore masks the column out
    column_mask = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, column_mask)
    data = [keep_columns(row, column_mask) for row in data]

    widths = column_widths([header_row, *data])
    gap = extra_gap + 1

    rows = [header_row]
    if delim:
        rule = [delim * (width + gap) for width in widths]
        rule[-1] = rule[-1][:-gap * len(delim)]  # Remove extra_gap from end of delimiter
        rows.append(rule)
    rows.extend(data)

    def pad(pos, cell):
        text = str(cell)
        fill = widths[pos] - visible_len(text)
        if '\t' in text:
            # The tab absorbs the padding, pushing the text after it to the right
            return text.replace('\t', ' ' * fill) + ' ' * gap
        return text + ' ' * (fill + gap)

    return '\n'.join(
        ''.join(pad(pos, cell) for pos, cell in enumerate(row)).rstrip()
        for row in rows)
347de493
PH
3691
3692
8f18aca8 3693def _match_one(filter_part, dct, incomplete):
77b87f05 3694 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3695 STRING_OPERATORS = {
3696 '*=': operator.contains,
3697 '^=': lambda attr, value: attr.startswith(value),
3698 '$=': lambda attr, value: attr.endswith(value),
3699 '~=': lambda attr, value: re.search(value, attr),
3700 }
347de493 3701 COMPARISON_OPERATORS = {
a047eeb6 3702 **STRING_OPERATORS,
3703 '<=': operator.le, # "<=" must be defined above "<"
347de493 3704 '<': operator.lt,
347de493 3705 '>=': operator.ge,
a047eeb6 3706 '>': operator.gt,
347de493 3707 '=': operator.eq,
347de493 3708 }
a047eeb6 3709
6db9c4d5 3710 if isinstance(incomplete, bool):
3711 is_incomplete = lambda _: incomplete
3712 else:
3713 is_incomplete = lambda k: k in incomplete
3714
64fa820c 3715 operator_rex = re.compile(r'''(?x)
347de493 3716 (?P<key>[a-z_]+)
77b87f05 3717 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3718 (?:
a047eeb6 3719 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3720 (?P<strval>.+?)
347de493 3721 )
347de493 3722 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3723 m = operator_rex.fullmatch(filter_part.strip())
347de493 3724 if m:
18f96d12 3725 m = m.groupdict()
3726 unnegated_op = COMPARISON_OPERATORS[m['op']]
3727 if m['negation']:
77b87f05
MT
3728 op = lambda attr, value: not unnegated_op(attr, value)
3729 else:
3730 op = unnegated_op
18f96d12 3731 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3732 if m['quote']:
3733 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3734 actual_value = dct.get(m['key'])
3735 numeric_comparison = None
f9934b96 3736 if isinstance(actual_value, (int, float)):
e5a088dc
S
3737 # If the original field is a string and matching comparisonvalue is
3738 # a number we should respect the origin of the original field
3739 # and process comparison value as a string (see
18f96d12 3740 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3741 try:
18f96d12 3742 numeric_comparison = int(comparison_value)
347de493 3743 except ValueError:
18f96d12 3744 numeric_comparison = parse_filesize(comparison_value)
3745 if numeric_comparison is None:
3746 numeric_comparison = parse_filesize(f'{comparison_value}B')
3747 if numeric_comparison is None:
3748 numeric_comparison = parse_duration(comparison_value)
3749 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3750 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3751 if actual_value is None:
6db9c4d5 3752 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3753 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3754
3755 UNARY_OPERATORS = {
1cc47c66
S
3756 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3757 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3758 }
64fa820c 3759 operator_rex = re.compile(r'''(?x)
347de493 3760 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3761 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3762 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3763 if m:
3764 op = UNARY_OPERATORS[m.group('op')]
3765 actual_value = dct.get(m.group('key'))
6db9c4d5 3766 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3767 return True
347de493
PH
3768 return op(actual_value)
3769
3770 raise ValueError('Invalid filter part %r' % filter_part)
3771
3772
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.

    The filter is a list of conditions joined by "&", all of which must pass.
    A literal ampersand inside a condition is written as "\\&".
    @returns            Whether the filter passes
    @param incomplete   Set of keys that is expected to be missing from dct.
                        Can be True/False to indicate all/none of the keys may be missing.
                        All conditions on incomplete keys pass if the key is missing
    """
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
347de493
PH
3783
3784
def match_filter_func(filters):
    """ Build a match-filter callable from one or more filter strings.

    @param filters  A filter string or a collection of them; an entry passes if
                    ANY of the filters matches. The special filter "-" enables
                    interactive mode
    @returns        None if no filters are given. Otherwise a function
                    (info_dict, incomplete=False) returning a skip message for
                    rejected entries, and None (or NO_DEFAULT in interactive
                    mode with complete data) for accepted ones
    """
    if not filters:
        return None

    filters = set(variadic(filters))
    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        accepted = not filters or any(
            match_str(f, info_dict, incomplete) for f in filters)
        if accepted:
            if interactive and not incomplete:
                return NO_DEFAULT
            return None

        video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
        filter_str = ') | ('.join(map(str.strip, filters))
        return f'{video_title} does not pass filter ({filter_str}), skipping ..'

    return _match_func
91410c9b
PH
3802
3803
class download_range_func:
    """ Callable yielding the sections of a video that should be downloaded.

    @param chapters  Regexes; every chapter whose title matches one is selected
    @param ranges    (start, end) pairs selecting explicit time ranges
    """

    def __init__(self, chapters, ranges):
        self.chapters = chapters
        self.ranges = ranges

    def __call__(self, info_dict, ydl):
        """ Yield a dict per selected section.

        Matching chapters come first (the chapter dict plus its 'index'),
        then the explicit ranges as {'start_time', 'end_time'}. With neither
        chapters nor ranges configured a single {} is yielded.
        ydl.to_screen is used to report that no chapter matched.
        """
        if not (self.ranges or self.chapters):
            yield {}

        if info_dict.get('chapters'):
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'

        matched = False
        for regex in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if not re.search(regex, chapter['title']):
                    continue
                matched = True
                yield {**chapter, 'index': index}
        if self.chapters and not matched:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges
5ec1b6b7 3827
3828
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP/TTML time expression to seconds.

    Understands an offset (a number with an optional "s" suffix) and the
    clock form H:MM:SS, whose fraction may follow a "." or a ":".
    @returns  Seconds as float, or None if the expression is empty or not recognised
    """
    if not time_expr:
        return None

    offset = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
bf6427d2
YCH
3840
3841
def srt_subtitles_timecode(seconds):
    """ Format a time in seconds as an SRT timecode (HH:MM:SS,mmm) """
    time_parts = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % time_parts
3844
3845
def ass_subtitles_timecode(seconds):
    """ Format a time in seconds as an ASS timecode (H:MM:SS.cc) """
    time_parts = timetuple_from_msec(seconds * 1000)
    # ASS has centisecond resolution
    centiseconds = time_parts.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*time_parts[:-1], centiseconds)
bf6427d2
YCH
3849
3850
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT.

    Styling (color, font family/size, bold, italic, underline) is carried over
    as <font>, <b>, <i> and <u> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no paragraph (<p>) elements
    '''
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> resolved properties (own + inherited)
    default_style = {}  # properties of the styles set on <body>/<div>

    class TTMLPElementParser:
        """ XMLParser target that renders one <p> subtree as SRT text """

        def __init__(self):
            # All state is per instance, so that nothing can carry over from
            # one paragraph to the next
            self._out = ''
            # One entry per open element: the SRT tags opened for it
            self._unclosed_elements = []
            # One entry per open element: the styling in effect inside it
            self._applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
                return

            unclosed_elements = []
            style = {}
            element_style_id = attrib.get('style')
            if default_style:
                style.update(default_style)
            if element_style_id:
                style.update(styles.get(element_style_id, {}))
            for prop in SUPPORTED_STYLING:
                prop_val = attrib.get(_x('tts:' + prop))
                if prop_val:
                    style[prop] = prop_val

            inherited = self._applied_styles[-1] if self._applied_styles else {}
            font = ''
            for k, v in sorted(style.items()):
                if inherited.get(k) == v:
                    continue  # Already in effect through an ancestor
                if k == 'color':
                    font += ' color="%s"' % v
                elif k == 'fontSize':
                    font += ' size="%s"' % v
                elif k == 'fontFamily':
                    font += ' face="%s"' % v
                elif k == 'fontWeight' and v == 'bold':
                    self._out += '<b>'
                    unclosed_elements.append('b')
                elif k == 'fontStyle' and v == 'italic':
                    self._out += '<i>'
                    unclosed_elements.append('i')
                elif k == 'textDecoration' and v == 'underline':
                    self._out += '<u>'
                    unclosed_elements.append('u')
            if font:
                self._out += '<font' + font + '>'
                unclosed_elements.append('font')

            # Push exactly one entry per element on both stacks; end() pops
            # exactly one from each, which keeps them balanced even when
            # the element's style did not open any tag
            self._applied_styles.append({**inherited, **style})
            self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag in (_x('ttml:br'), 'br'):
                return
            for element in reversed(self._unclosed_elements.pop()):
                self._out += '</%s>' % element
            self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialize the node and replay it through the SRT-rendering target
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style can only be built once its parent is
    # known, so the style elements are scanned repeatedly until none is left
    # waiting for its parent
    stalled_parents = None
    while True:
        missing_parents = set()
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    missing_parents.add(parent_style_id)
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not missing_parents:
            break
        if missing_parents == stalled_parents:
            # A whole pass resolved nothing new: the remaining parents are
            # undefined, cyclic, or carry no supported property. Treat them
            # as empty styles so that the next pass finishes instead of
            # repeating forever
            for parent_style_id in missing_parents:
                styles.setdefault(parent_style_id, {})
        stalled_parents = missing_parents

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Fall back to begin + duration; skip cues that have neither
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
4013
4014
def cli_option(params, command_option, param, separator=None):
    """ Build the command line arguments for an option that takes a value.

    @param params          Dict to look the value up in
    @param command_option  The option as it appears on the command line
    @param param           Key of the value in params
    @param separator       If given, option and value are joined with it into
                           one argument; otherwise they are two arguments
    @returns               A list of arguments; empty if the value is None
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
66e289ba
S
4020
4021
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Build the command line arguments for an option taking a boolean value.

    The value of params[param] must be True, False or None (unset). True/False
    are rendered as true_value/false_value; None yields no arguments.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered_values = {True: true_value, False: false_value}
    return cli_option(rendered_values, command_option, flag, separator)
66e289ba
S
4026
4027
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] if params[param] equals expected_value, else [] """
    if params.get(param) == expected_value:
        return [command_option]
    return []
66e289ba
S
4030
4031
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """ Select the extra command line arguments configured for a program.

    @param argdict     Dict mapping (lowercase) keys to argument lists. A plain
                       list/tuple is accepted for backward compatibility and
                       returned unchanged if use_compat is set
    @param keys        List of keys in order of priority. An entry may itself
                       be a collection of keys, whose argument lists are
                       concatenated
    @param default     Returned as-is when nothing matches. NOTE: this includes
                       the shared default list, so callers must not mutate
                       the result
    @param use_compat  Whether a list/tuple argdict should be honoured
    @returns           The arguments of the first entry of keys that has any
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = [argdict.get(key.lower()) for key in variadic(key_list)]
        arg_lists = [args for args in candidates if args is not None]
        if arg_lists:
            return list(itertools.chain.from_iterable(arg_lists))
    return default
66e289ba 4050
6251555f 4051
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """ Look up the configured arguments for the executable exe used by main_key.

    The lookup keys are built from a root key, which is exe alone if it equals
    main_key (case-insensitively) and "main_key+exe" otherwise, followed by
    each suffix in keys. Only when the bare root key is among them are the
    fallbacks ((main_key, exe) and 'default') added and use_compat honoured.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'

    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')

    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
4063
66e289ba 4064
class ISO639Utils:
    """ Conversion between two-letter and three-letter language codes """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked at.
        Returns None for unknown codes.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        If several short codes map to the same long code, the one listed
        first wins. Returns None for unknown codes.
        """
        return next(
            (short_name for short_name, long_name in cls._lang_map.items()
             if long_name == code),
            None)
4268
4269
class ISO3166Utils:
    """ Lookup of country names by two-letter country code """

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the corresponding full name

        The lookup is case-insensitive. Returns None for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4531
4532
class GeoUtils:
    """Helpers for picking IPv4 addresses that belong to a given country"""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """
        Pick a random IPv4 address from a country's block or from an explicit block

        @param code_or_block  Either a two-letter country code (any case), looked
                              up in `_country_ip_map`, or a block in CIDR notation
                              such as '192.168.0.0/24'
        @returns              Dotted-quad address string, or None when the country
                              code has no entry in the map
        """
        cidr = code_or_block
        # Anything exactly two characters long is treated as a country code
        if len(code_or_block) == 2:
            cidr = cls._country_ip_map.get(code_or_block.upper())
            if not cidr:
                return None
        network, prefix_length = cidr.split('/')
        (lowest,) = struct.unpack('!L', socket.inet_aton(network))
        # All host bits set gives the highest address inside the block
        host_mask = 0xffffffff >> int(prefix_length)
        picked = random.randint(lowest, lowest | host_mask)
        return str(socket.inet_ntoa(struct.pack('!L', picked)))
773f291d
S
4791
4792
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """
    ProxyHandler that lets an individual request choose its own proxy

    A request carrying a `Ytdl-request-proxy` header uses that header's value
    instead of the handler-wide proxy; the header is removed once consumed.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            proxy = override
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = urllib.parse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # Only record the proxy on the request here;
            # yt-dlp's http/https handlers do wrapping the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        return urllib.request.ProxyHandler.proxy_open(self, req, proxy, type)
5bc880b9
YCH
4816
4817
0a5445dd
YCH
4818# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4819# released into Public Domain
4820# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4821
def long_to_bytes(n, blocksize=0):
    """
    Convert an integer to a big-endian byte string

    @param n          the integer to convert; values <= 0 give a single zero byte
    @param blocksize  if greater than zero, left-pad the result with zero bytes
                      so that its length is a multiple of blocksize
    @returns          bytes
    """
    n = int(n)
    words = []
    while n > 0:
        # Peel off the lowest 32 bits; words are collected least significant first
        words.append(struct.pack('>I', n & 0xffffffff))
        n >>= 32
    # Drop leading zero bytes, but always keep at least one byte
    s = b''.join(reversed(words)).lstrip(b'\000') or b'\000'
    remainder = len(s) % blocksize if blocksize > 0 else 0
    if remainder:
        s = b'\000' * (blocksize - remainder) + s
    return s
4850
4851
def bytes_to_long(s):
    """
    Convert a big-endian byte string to an integer

    This is (essentially) the inverse of long_to_bytes().

    @param s  the byte string; an empty one gives 0
    @returns  int
    """
    # Left-pad with zero bytes so the data splits evenly into 32-bit words
    missing = -len(s) % 4
    if missing:
        s = b'\000' * missing + s
    value = 0
    for offset in range(0, len(s), 4):
        (word,) = struct.unpack('>I', s[offset:offset + 4])
        value = (value << 32) + word
    return value
4867
4868
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The data is read in reversed byte order before being turned into a number
    reversed_hex = binascii.hexlify(data[::-1])
    message = int(reversed_hex, 16)
    cipher = pow(message, exponent, modulus)
    return '%x' % cipher
81bdc8fd
YCH
4884
4885
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result is laid out as `[0, 2] + padding + [0] + data`.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves fewer than 8 bytes for the padding
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding bytes must be nonzero: the first zero byte after the [0, 2]
    # prefix marks where the data starts
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4899
4900
7b2c3f47 4901def _base_n_table(n, table):
4902 if not table and not n:
4903 raise ValueError('Either table or n must be specified')
612f2be5 4904 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4905
44f14eb4 4906 if n and n != len(table):
612f2be5 4907 raise ValueError(f'base {n} exceeds table length {len(table)}')
4908 return table
59f898b7 4909
5eb6bdce 4910
7b2c3f47 4911def encode_base_n(num, n=None, table=None):
4912 """Convert given int to a base-n string"""
612f2be5 4913 table = _base_n_table(n, table)
7b2c3f47 4914 if not num:
5eb6bdce
YCH
4915 return table[0]
4916
7b2c3f47 4917 result, base = '', len(table)
81bdc8fd 4918 while num:
7b2c3f47 4919 result = table[num % base] + result
612f2be5 4920 num = num // base
7b2c3f47 4921 return result
4922
4923
4924def decode_base_n(string, n=None, table=None):
4925 """Convert given base-n string to int"""
4926 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4927 result, base = 0, len(table)
4928 for char in string:
4929 result = result * base + table[char]
4930 return result
4931
4932
def decode_base(value, digits):
    """Deprecated alias: decode `value` using `digits` as the digit table"""
    message = (
        f'{__name__}.decode_base is deprecated and may be removed '
        f'in a future version. Use {__name__}.decode_base_n instead')
    deprecation_warning(message)
    return decode_base_n(value, table=digits)
f52354a8
YCH
4937
4938
def decode_packed_codes(code):
    """
    Expand code matched by PACKED_CODES_RE back into its unpacked form

    Each word of the obfuscated code is the base-n encoding of an index into
    the symbol list; an empty symbol means the word stands for itself.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, remaining = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    while remaining:
        remaining -= 1
        word = encode_base_n(remaining, base)
        symbol_table[word] = symbols[remaining] or word

    def restore(match):
        return symbol_table[match.group(0)]

    return re.sub(r'\b(\w+)\b', restore, obfuscated_code)
e154c651 4955
4956
1ced2221
S
def caesar(s, alphabet, shift):
    """
    Shift every character of `s` that occurs in `alphabet` by `shift` positions

    Positions wrap around the alphabet; characters outside it are kept as-is.
    """
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = []
    for char in s:
        if char in alphabet:
            char = alphabet[(alphabet.index(char) + shift) % size]
        shifted.append(char)
    return ''.join(shifted)


def rot47(s):
    """Apply a caesar shift of 47 over the 94 characters from '!' to '~'"""
    printable = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable, 47)
4968
4969
def parse_m3u8_attributes(attrib):
    """
    Parse a comma-separated `KEY=value` attribute list into a dict

    Values may be double-quoted (and may then contain commas); the surrounding
    quotes are removed. All values are returned as strings.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in pairs
    }
1143535d
YCH
4977
4978
def urshift(val, n):
    """Shift `val` right by `n` bits, first adding 2**32 to negative values"""
    if val < 0:
        val += 0x100000000
    return val >> n
d3f8e038
YCH
4981
4982
4983# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4984# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
def decode_png(png_data):
    """
    Decode the pixel data of a PNG image

    Rows are read as 3 bytes per pixel (presumably 8-bit RGB, non-interlaced);
    the bit depth and color type stored in IHDR are not checked.

    @param png_data  the raw bytes of the PNG file
    @returns         (width, height, pixels), where pixels is a list of rows
                     and each row is a flat list of width * 3 byte values
    @raises OSError  if the signature/IHDR is missing or there is no IDAT data
    """
    # Reference: https://www.w3.org/TR/PNG/
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or png_data[12:16] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    int_formats = {1: '>B', 2: '>H', 4: '>I'}

    def read_int(raw):
        return struct.unpack(int_formats[len(raw)], raw)[0]

    # Walk the chunk list: 4-byte length, 4-byte type, data, 4-byte CRC
    chunks = []
    pos, total = 8, len(png_data)
    while pos < total:
        length = read_int(png_data[pos:pos + 4])
        chunk_type = png_data[pos + 4:pos + 8]
        chunk_data = png_data[pos + 8:pos + 8 + length]
        chunks.append((chunk_type, chunk_data))
        pos += 8 + length + 4  # The CRC is skipped, not verified

    ihdr = chunks[0][1]
    width = read_int(ihdr[:4])
    height = read_int(ihdr[4:8])

    idat = b''
    for chunk_type, chunk_data in chunks:
        if chunk_type == b'IDAT':
            idat += chunk_data

    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    stride = width * 3
    pixels = []

    for y in range(height):
        # Every scanline is prefixed with one byte naming its filter
        row_start = y * (1 + stride)
        filter_type = decompressed_data[row_start]

        row = []
        pixels.append(row)
        above = pixels[y - 1] if y > 0 else None

        for x in range(stride):
            color = decompressed_data[row_start + 1 + x]
            # Neighbours are the same channel of the pixel to the left / above
            left = row[x - 3] if x > 2 else 0
            up = above[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                upper_left = above[x - 3] if x > 2 and y > 0 else 0

                estimate = left + up - upper_left
                dist_left = abs(estimate - left)
                dist_up = abs(estimate - up)
                dist_upper_left = abs(estimate - upper_left)

                if dist_left <= dist_up and dist_left <= dist_upper_left:
                    color = (color + left) & 0xff
                elif dist_up <= dist_upper_left:
                    color = (color + up) & 0xff
                else:
                    color = (color + upper_left) & 0xff

            row.append(color)

    return width, height, pixels
efa97bdc
YCH
5088
5089
def write_xattr(path, key, value):
    """
    Write the extended attribute `key` = `value` on the file at `path`

    Three mechanisms are tried in order: NTFS Alternate Data Streams when
    compat_os_name is 'nt', then the python xattr/pyxattr module, then the
    `setfattr`/`xattr` executables.

    @param path   the file to tag
    @param key    the attribute name
    @param value  the attribute value as bytes (it is written to a binary
                  stream, or decoded before being passed on a command line)
    @raises XAttrMetadataError     if writing the attribute fails
    @raises XAttrUnavailableError  if neither a module nor an executable is available
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            # The stream is addressed as "<path>:<key>"
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The executables take the value as a text argument
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        # A non-zero exit status is reported together with the tool's stderr
        raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
5139
5140
def random_birthday(year_field, month_field, day_field):
    """
    Generate a random date of birth between 1950-01-01 and 1995-12-31 (inclusive)

    @returns  dict mapping the three given field names to the year, month and
              day of the date, each as a string without zero padding
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }
732044af 5151
c76eb41b 5152
# Templates for internet shortcut files, which are plain text files.
# Placeholders use %-style mapping keys: `%(url)s` and, for the desktop
# entry only, `%(filename)s`.

# `.url` shortcut: a single [InternetShortcut] section holding the URL
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# `.webloc` shortcut: an Apple property list with a single URL key
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# `.desktop` shortcut: a [Desktop Entry] of Type=Link
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Lookup of the templates above by shortcut type
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5184
732044af 5185
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    An explicit port is kept, except for port 80 on `http` URLs where it is the default. IRIs without a host (e.g. relative references) are supported.

    @param iri          the IRI, `str`
    @returns            the equivalent URI, `str`
    @raises ValueError  if the IRI contains an IPv6 host or an invalid port
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    # `hostname` is None when the IRI has no network location at all
    hostname = iri_parts.hostname or ''
    net_location += hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.

    # Port 80 is only redundant for http; dropping it for any other scheme
    # would silently redirect the request to that scheme's default port
    port = iri_parts.port
    if port is not None and not (port == 80 and iri_parts.scheme == 'http'):
        net_location += ':' + str(port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5228
5229
def to_high_limit_path(path):
    """Return `path` in a form that is not subject to Windows' MAX_PATH limit; other platforms get it back unchanged"""
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    absolute = os.path.abspath(path)
    return '\\\\?\\' + absolute
76d321f6 5236
c76eb41b 5237
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """
    Look up `field` in `obj` and format it with `template`

    @param field     path passed to traverse_obj (wrapped in variadic)
    @param template  %-format string applied to the processed value
    @param ignore    value(s) for which `default` is returned instead; when not
                     given, every falsy value except 0 is ignored
    @param default   what to return for ignored values
    @param func      applied to the value before formatting
    """
    val = traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val and val != 0
    else:
        skip = val in variadic(ignore)
    if skip:
        return default
    return template % func(val)
00dd0cd5 5243
5244
def clean_podcast_url(url):
    """Remove the known podcast tracking/analytics redirect prefixes from `url`"""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix_re, '', url)
ffcb8191
THD
5260
5261
5262_HEX_TABLE = '0123456789abcdef'
5263
5264
5265def random_uuidv4():
5266 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5267
5268
def make_dir(path, to_screen=None):
    """
    Create the directory that is to contain the file at `path`, if necessary

    @param path       path of a file; only its directory part is created
    @param to_screen  optional callable that is given the error message
                      when the directory cannot be created
    @returns          True on success, False if creating the directory failed
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except OSError as err:
        # Report only when a reporter was actually supplied
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
f74980cb 5279
5280
def get_executable_path():
    """Return the absolute path of the directory containing the running executable, as reported by the update module"""
    from .update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
f74980cb 5285
5286
def load_plugins(name, suffix, namespace):
    """
    Load the plugin package `ytdlp_plugins/<name>` found next to the executable

    @param name       name of the plugin package, also used as its module name
    @param suffix     only attributes whose name ends with this are picked up
    @param namespace  dict that receives the picked-up attributes; names that
                      are already present in it are left alone
    @returns          dict of the newly added attributes by name; empty if the
                      package does not exist
    """
    classes = {}
    with contextlib.suppress(FileNotFoundError):
        init_file = os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py')
        plugins_spec = importlib.util.spec_from_file_location(name, init_file)
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        for attr_name in dir(plugins):
            if attr_name in namespace or not attr_name.endswith(suffix):
                continue
            classes[attr_name] = namespace[attr_name] = getattr(plugins, attr_name)
    return classes
06167fbb 5303
5304
def traverse_obj(
        obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    """
    Safely traverse nested `dict`s and `Sequence`s

    >>> obj = [{}, {"key": "value"}]
    >>> traverse_obj(obj, (1, "key"))
    'value'

    Each of the provided `paths` is tested and the first producing a valid result will be returned.
    The next path will also be tested if the path branched but no results could be found.
    Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
    A value of None is treated as the absence of a value.

    The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.

    The keys in the path can be one of:
        - `None`:           Return the current object.
        - `str`/`int`:      Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
        - `slice`:          Branch out and return all values in `obj[key]`.
        - `Ellipsis`:       Branch out and return a list of all values.
        - `tuple`/`list`:   Branch out and return a list of all matching values.
                            Read as: `[traverse_obj(obj, branch) for branch in branches]`.
        - `function`:       Branch out and return values filtered by the function.
                            Read as: `[value for key, value in obj if function(key, value)]`.
                            For `Sequence`s, `key` is the index of the value.
        - `dict`            Transform the current object and return a matching dict.
                            Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.

    `tuple`, `list`, and `dict` all support nested paths and branches.

    @params paths           Paths which to traverse by.
    @param default          Value to return if the paths do not match.
    @param expected_type    If a `type`, only accept final values of this type.
                            If any other callable, try to call the function on each result.
    @param get_all          If `False`, return the first matching result, otherwise all matching ones.
    @param casesense        If `False`, consider string dictionary keys as case insensitive.

    The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API

    @param is_user_input    Whether the keys are generated from user input.
                            If `True` strings get converted to `int`/`slice` if needed.
    @param traverse_string  Whether to traverse into objects as strings.
                            If `True`, any non-compatible object will first be
                            converted into a string and then traversed into.


    @returns                The result of the object traversal.
                            If successful, `get_all=True`, and the path branches at least once,
                            then a list of results is returned instead.
                            A list is always returned if the last path branches and no `default` is given.
    """
    is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
    casefold = lambda k: k.casefold() if isinstance(k, str) else k

    # `type_test` maps every final value to itself (or a converted value), or to None to discard it
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    else:
        type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))

    def apply_key(key, obj):
        # Generator yielding the value(s) that a single `key` selects from `obj`
        if obj is None:
            return

        elif key is None:
            yield obj

        elif isinstance(key, (list, tuple)):
            # Every element is a path of its own, applied to the same object
            for branch in key:
                _, result = apply_path(obj, branch)
                yield from result

        elif key is ...:
            if isinstance(obj, collections.abc.Mapping):
                yield from obj.values()
            elif is_sequence(obj):
                yield from obj
            elif isinstance(obj, re.Match):
                yield from obj.groups()
            elif traverse_string:
                yield from str(obj)

        elif callable(key):
            if is_sequence(obj):
                iter_obj = enumerate(obj)
            elif isinstance(obj, collections.abc.Mapping):
                iter_obj = obj.items()
            elif isinstance(obj, re.Match):
                # Index 0 is the whole match, followed by the groups
                iter_obj = enumerate((obj.group(), *obj.groups()))
            elif traverse_string:
                iter_obj = enumerate(str(obj))
            else:
                return
            yield from (v for k, v in iter_obj if try_call(key, args=(k, v)))

        elif isinstance(key, dict):
            iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items())
            # Entries without a result are dropped unless a `default` was given to fill them in
            yield {k: v if v is not None else default for k, v in iter_obj
                   if v is not None or default is not NO_DEFAULT}

        elif isinstance(obj, collections.abc.Mapping):
            # An exact match wins; otherwise fall back to a case-insensitive
            # scan (`key` was already casefolded by apply_path in that case)
            yield (obj.get(key) if casesense or (key in obj)
                   else next((v for k, v in obj.items() if casefold(k) == key), None))

        elif isinstance(obj, re.Match):
            if isinstance(key, int) or casesense:
                with contextlib.suppress(IndexError):
                    yield obj.group(key)
                return

            if not isinstance(key, str):
                return

            yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)

        else:
            if is_user_input:
                # User input arrives as strings: 'a:b:c' becomes a slice, anything else an int (or None)
                key = (int_or_none(key) if ':' not in key
                       else slice(*map(int_or_none, key.split(':'))))

            if not isinstance(key, (int, slice)):
                return

            if not is_sequence(obj):
                if not traverse_string:
                    return
                obj = str(obj)

            with contextlib.suppress(IndexError):
                yield obj[key]

    def apply_path(start_obj, path):
        # Apply every key of `path` in turn; returns (has_branched, lazy iterable of results)
        objs = (start_obj,)
        has_branched = False

        for key in variadic(path):
            if is_user_input and key == ':':
                key = ...

            if not casesense and isinstance(key, str):
                key = key.casefold()

            # NOTE(review): a `slice` key does not set `has_branched` here, although
            # the docstring describes it as branching - confirm which is intended
            if key is ... or isinstance(key, (list, tuple)) or callable(key):
                has_branched = True

            key_func = functools.partial(apply_key, key)
            objs = itertools.chain.from_iterable(map(key_func, objs))

        return has_branched, objs

    def _traverse_obj(obj, path, use_list=True):
        # Evaluate one path and reduce its results to a list, a single value or None
        has_branched, results = apply_path(obj, path)
        results = LazyList(x for x in map(type_test, results) if x is not None)

        if get_all and has_branched:
            return results.exhaust() if results or use_list else None

        return results[0] if results else None

    for index, path in enumerate(paths, 1):
        # Only the last path may return an empty list, and only when no `default` was given
        use_list = default is NO_DEFAULT and index == len(paths)
        result = _traverse_obj(obj, path, use_list)
        if result is not None:
            return result

    return None if default is NO_DEFAULT else default
324ad820 5472
5473
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated alias of traverse_obj with `is_user_input` and `traverse_string` enabled"""
    message = (
        f'"{__name__}.traverse_dict" is deprecated and may be removed '
        f'in a future version. Use "{__name__}.traverse_obj" instead')
    deprecation_warning(message)
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5478
5479
def get_first(obj, keys, **kwargs):
    """Return the first result of following `keys` in any of the values of `obj`; kwargs are passed on to traverse_obj"""
    path = (..., *variadic(keys))
    return traverse_obj(obj, path, **kwargs, get_all=False)
5482
5483
3e9b66d7
LNO
def time_seconds(**kwargs):
    """
    Return the current POSIX timestamp

    @param kwargs  `datetime.timedelta` arguments giving the UTC offset of the
                   timezone in which the current time is taken
    """
    utc_offset = datetime.timedelta(**kwargs)
    now = datetime.datetime.now(datetime.timezone(utc_offset))
    return now.timestamp()
5487
5488
49fa4d9a
N
5489# create a JSON Web Signature (jws) with HS256 algorithm
5490# the resulting format is in JWS Compact Serialization
5491# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5492# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """
    Create a JSON Web Signature signed with HMAC-SHA256

    @param payload_data  JSON-serializable payload
    @param key           the signing secret, `str`
    @param headers       optional dict of header fields, merged over the
                         default `alg` and `typ` fields
    @returns             `bytes` of the form b'<header>.<payload>.<signature>'

    NOTE(review): the three segments are encoded with standard, padded base64,
    whereas RFC 7515 specifies unpadded base64url. The encoding is left
    untouched since existing callers may depend on the exact token - confirm
    before changing.
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    # The signature covers the encoded header and payload joined by a dot
    signing_input = header_b64 + b'.' + payload_b64
    h = hmac.new(key.encode(), signing_input, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return signing_input + b'.' + signature_b64
819e0531 5506
5507
16b0d7e6 5508# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the JSON payload of a three-segment, dot-separated token.

    Only the payload segment is decoded; the header and signature segments
    are split off and otherwise ignored (no verification is performed).
    """
    _header_segment, payload_segment, _signature_segment = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded_payload = f'{payload_segment}==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
5514
5515
# None when not running on Windows ('nt'); on Windows it starts as False and is
# flipped to True by windows_enable_vt_mode() once that call succeeds
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5517
5518
@functools.cache
def supports_terminal_sequences(stream):
    """Tell whether `stream` should be sent terminal escape sequences.

    On Windows this requires WINDOWS_VT_MODE to be truthy; elsewhere the
    TERM environment variable must be set and non-empty. In both cases the
    stream must additionally report itself as a TTY. The answer is cached
    per stream.
    """
    on_windows = compat_os_name == 'nt'
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False

    try:
        is_tty = stream.isatty()
    except BaseException:
        # Any failure to query the stream is treated as "not a terminal"
        return False
    return is_tty
5530
5531
def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    """Mark VT mode as enabled on sufficiently new Windows versions.

    Does nothing when the Windows version is older than 10.0.10586 or when
    running the empty shell command fails. On success WINDOWS_VT_MODE is set
    and the `supports_terminal_sequences` cache is invalidated.
    """
    global WINDOWS_VT_MODE

    if get_windows_version() < (10, 0, 10586):
        return

    try:
        # NOTE(review): presumably spawning a shell is what switches the console mode - confirm
        Popen.run('', shell=True)
    except Exception:
        return
    else:
        WINDOWS_VT_MODE = True
        # Previously cached answers were computed with VT mode disabled
        supports_terminal_sequences.cache_clear()
5543
5544
# ESC, "[", one or more characters other than "m", then the terminating "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with every match of `_terminal_sequences_re` deleted"""
    return re.sub(_terminal_sequences_re, '', string)
5550
5551
def number_of_digits(number):
    """Return the length of `number` rendered with `%d` (a minus sign counts)"""
    rendered = '%d' % number
    return len(rendered)
34921b43 5554
5555
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy values with `delim`.

    @param from_dict    When given, each value is instead treated as a path
                        that is looked up in this object via `traverse_obj`
    """
    if from_dict is not None:
        values = map(lambda key: traverse_obj(from_dict, variadic(key)), values)
    return delim.join(str(value) for value in values if value)
06e57990 5560
5561
27231526
ZM
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Replace whatever `url_width_re` matches with that largest width
    * Update dimensions

    If no format has a usable width, `thumbnails` is returned untouched.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    # (width, height) tuples compare lexicographically, so width decides first
    candidates = [tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats]
    max_dimensions = max(candidates, default=(0, 0))
    max_width = max_dimensions[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        rewritten_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': rewritten_url},
            dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
5582
5583
93c8410d
LNO
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple.

    @returns    (start, end, total); end and total go through `int_or_none`,
                and all three are None when the value is empty or unparsable
    """
    unparsed = (None, None, None)
    if not range:
        return unparsed

    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if mobj is None:
        return unparsed

    start, end, total = mobj.groups()
    return int(start), int_or_none(end), int_or_none(total)
5592
5593
def read_stdin(what):
    """Announce that `what` is being read from STDIN and return `sys.stdin`.

    The message names the EOF key combination for the current platform.
    """
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    else:
        eof = 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5598
5599
a904a7f8
L
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data     Leading bytes of the file
    @returns (encoding, bytes to skip); encoding is None when nothing was detected
    """

    # BOM marks are given priority over declarations
    bom_hit = next(
        ((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_hit is not None:
        return bom_hit

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if mobj is None:
        return None, 0
    return mobj.group(1).decode(), 0
a904a7f8
L
5616
5617
class Config:
    """One source of command-line options plus the configs it pulls in.

    A Config holds the raw arguments of a single source (`own_args`) and a
    list of child Configs (`configs`) created for every location named in the
    parsed `config_locations` option. All Configs in one tree share the same
    `_loaded_paths` set so that a location is only loaded once.
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Set this config's arguments/filename and load nested configs.

        May only be called once per instance.
        @returns    False if `filename` was already loaded, True otherwise
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own arguments and recursively load every referenced config location"""
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            # Relative locations inside a config file are resolved against its directory
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # STDIN can only be consumed once
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            # Every line of a nested config is prefixed with "| "
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read a config file and split it into a list of arguments.

        @param default  Returned as-is when the file cannot be opened;
                        callers should treat it as read-only
        @raises ValueError  if the contents cannot be decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present

        # The handle is closed on every exit path, including errors raised
        # while detecting the encoding
        with optionf:
            try:
                enc, skip = determine_file_encoding(optionf.read(512))
                optionf.seek(skip, io.SEEK_SET)
            except OSError:
                enc = None  # silently skip read errors
            try:
                # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
                contents = optionf.read().decode(enc or preferredencoding())
                return shlex.split(contents, comments=True)
            except Exception as err:
                raise ValueError(f'Unable to parse "{filename}": {err}') from err

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with the values of credential options replaced by 'PRIVATE'"""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handles the "--option=value" form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handles the "--option value" form: the following argument is the secret
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config from `args` (as for `init`) and keep it if it loaded"""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Arguments of all nested configs (last-added first), followed by this config's own"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
da42679b
LNO
5725
5726
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes

    Each wrapper owns a private event loop on which every operation is run
    to completion synchronously. The wrapper can be used as a context manager;
    `__exit__` is also registered with `atexit`.
    """
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        # __exit__ is also registered with atexit, so it may run again after the
        # wrapper was already closed; a closed loop cannot run anything
        if self.loop.is_closed():
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Leftover tasks must be cancelled while the loop can still run
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` to completion on `loop` and return its result

        @raises ValueError  if `main` is not a coroutine
        """
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel every unfinished task of `loop` and wait for the cancellations to finish"""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # gather() takes the loop from the tasks themselves; its "loop"
        # parameter was removed in python 3.10
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5796
5797
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            # Names are normalized with str.title() so differently-cased keys collide
            merged[name.title()] = value
    return merged
28787f16 5801
5802
def cached_method(f):
    """Cache a method

    Results are stored per instance (in the instance's `__dict__`) and per
    method name, keyed by the fully bound arguments with defaults applied.
    """
    sig = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        call = sig.bind(self, *args, **kwargs)
        call.apply_defaults()
        # The first bound argument is the instance itself; it is not part of the key
        _instance, *key_parts = call.arguments.values()
        cache_key = tuple(key_parts)

        instance_caches = vars(self).setdefault('__cached_method__cache', {})
        method_cache = instance_caches.setdefault(f.__name__, {})
        if cache_key in method_cache:
            return method_cache[cache_key]
        method_cache[cache_key] = f(self, *args, **kwargs)
        return method_cache[cache_key]
    return wrapper
5818
5819
class classproperty:
    """property access for class methods

    Descriptor that calls the wrapped function with the owning class,
    whether it is accessed through the class or through an instance.
    """

    def __init__(self, func):
        # Copy the function's metadata (__name__, __doc__, ...) onto the descriptor
        functools.wraps(func)(self)
        self.func = func

    def __get__(self, _instance, cls):
        getter = self.func
        return getter(cls)
19a03940 5829
5830
class Namespace(types.SimpleNamespace):
    """SimpleNamespace that can be iterated over

    Iterating yields the attribute values; `items_` gives (name, value) pairs.
    """

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        return vars(self).items()
9b8ee23b 5840
5841
# File extensions grouped by media type
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# After these two lines, "video" and "audio" also contain the respective "common_*" entries
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions, in that order
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5856
5857
be5c1ae8 5858class RetryManager:
5859 """Usage:
5860 for retry in RetryManager(...):
5861 try:
5862 ...
5863 except SomeException as err:
5864 retry.error = err
5865 continue
5866 """
5867 attempt, _error = 0, None
5868
5869 def __init__(self, _retries, _error_callback, **kwargs):
5870 self.retries = _retries or 0
5871 self.error_callback = functools.partial(_error_callback, **kwargs)
5872
5873 def _should_retry(self):
5874 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5875
7a32c70d 5876 @property
be5c1ae8 5877 def error(self):
5878 if self._error is NO_DEFAULT:
5879 return None
5880 return self._error
5881
7a32c70d 5882 @error.setter
be5c1ae8 5883 def error(self, value):
5884 self._error = value
5885
5886 def __iter__(self):
5887 while self._should_retry():
5888 self.error = NO_DEFAULT
5889 self.attempt += 1
5890 yield self
5891 if self.error:
5892 self.error_callback(self.error, self.attempt, self.retries)
5893
7a32c70d 5894 @staticmethod
be5c1ae8 5895 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5896 """Utility function for reporting retries"""
5897 if count > retries:
5898 if error:
5899 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5900 raise e
5901
5902 if not count:
5903 return warn(e)
5904 elif isinstance(e, ExtractorError):
3ce29336 5905 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 5906 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5907
5908 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5909 if delay:
5910 info(f'Sleeping {delay:.2f} seconds ...')
5911 time.sleep(delay)
5912
5913
def make_archive_id(ie, video_id):
    """Build an archive id of the form "<lowercased extractor key> <video id>"

    @param ie   Either the extractor key as a string, or an object whose
                `ie_key()` returns it
    """
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
5917
5918
def truncate_string(s, left, right=0):
    """Shorten `s` by replacing its middle with "..."

    @param left     Length of the leading part including the 3-character
                    ellipsis; must be greater than 3
    @param right    Number of trailing characters to keep; must not be negative
    @returns        `s` unchanged if it is None or no longer than `left + right`,
                    otherwise the shortened string
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    # s[-0:] is the whole string, so the tail has to be special-cased for right == 0
    tail = s[-right:] if right else ''
    return f'{s[:left - 3]}...{tail}'
5924
5925
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Resolve a list of options into the set of selected items.

    @param options      Iterable of strings; a leading "-" removes instead of adds
    @param alias_dict   Maps alias names to lists of options; must contain "all",
                        which lists every valid item
    @param use_regex    Treat non-alias options as case-insensitive regexes that
                        are fully matched against the items of "all"
    @param start        Items selected before any option is applied
    @returns            The result of `orderedSet` over the selected items
    @raises ValueError  for an option that is neither an alias nor a valid item
    """
    assert 'all' in alias_dict, '"all" alias is required'
    selected = list(start or [])
    for option in options:
        negate = option.startswith('-')
        name = option[1:] if negate else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if negate:
                # Removing an alias means inverting each of its members
                expansion = [
                    member[1:] if member.startswith('-') else f'-{member}'
                    for member in expansion]
            # NB: Do not allow regex in aliases for performance
            selected = orderedSet_from_options(expansion, alias_dict, start=selected)
            continue

        if use_regex:
            matches = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            matches = [name]
        else:
            raise ValueError(name)

        if not negate:
            selected.extend(matches)
            continue
        for match in matches:
            while match in selected:
                selected.remove(match)

    return orderedSet(selected)
5954
5955
9b8ee23b 5956# Deprecated
5957has_certifi = bool(certifi)
5958has_websockets = bool(websockets)