[yt-dlp.git] / yt_dlp / utils.py
6929b41a 1import asyncio
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
ab029d7e 8import collections.abc
62e609ab 9import contextlib
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
d77c3dfd 14import gzip
49fa4d9a 15import hashlib
16import hmac
ac668111 17import html.entities
18import html.parser
54007a45 19import http.client
20import http.cookiejar
b1f94422 21import inspect
03f9daab 22import io
79a2e94e 23import itertools
f4bfd65f 24import json
d77c3dfd 25import locale
02dbf93f 26import math
f8271158 27import mimetypes
347de493 28import operator
d77c3dfd 29import os
c496ca96 30import platform
773f291d 31import random
d77c3dfd 32import re
f8271158 33import shlex
c496ca96 34import socket
79a2e94e 35import ssl
ac668111 36import struct
1c088fa8 37import subprocess
d77c3dfd 38import sys
181c8655 39import tempfile
c380cc28 40import time
01951dda 41import traceback
64fa820c 42import types
989a01c2 43import unicodedata
14f25df2 44import urllib.error
f8271158 45import urllib.parse
ac668111 46import urllib.request
bcf89ce6 47import xml.etree.ElementTree
d77c3dfd 48import zlib
d77c3dfd 49
6929b41a 50from .compat import functools # isort: split
8c25f81b 51from .compat import (
36e6f62c 52 compat_etree_fromstring,
51098426 53 compat_expanduser,
f8271158 54 compat_HTMLParseError,
efa97bdc 55 compat_os_name,
702ccf2d 56 compat_shlex_quote,
8c25f81b 57)
ac668111 58from .dependencies import brotli, certifi, websockets, xattr
f8271158 59from .socks import ProxyType, sockssocket
71aff188 60
4644ac55 61
51fb4995 62def register_socks_protocols():
63 # "Register" SOCKS protocols
d5ae6bb5 64 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
65 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
51fb4995 66 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
14f25df2 67 if scheme not in urllib.parse.uses_netloc:
68 urllib.parse.uses_netloc.append(scheme)
51fb4995 69
70
468e2e92 71# This is not clearly defined otherwise
72compiled_regex_type = type(re.compile(''))
73
f7a147e3 74
75def random_user_agent():
76 _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
77 _CHROME_VERSIONS = (
19b4c74d 78 '90.0.4430.212',
79 '90.0.4430.24',
80 '90.0.4430.70',
81 '90.0.4430.72',
82 '90.0.4430.85',
83 '90.0.4430.93',
84 '91.0.4472.101',
85 '91.0.4472.106',
86 '91.0.4472.114',
87 '91.0.4472.124',
88 '91.0.4472.164',
89 '91.0.4472.19',
90 '91.0.4472.77',
91 '92.0.4515.107',
92 '92.0.4515.115',
93 '92.0.4515.131',
94 '92.0.4515.159',
95 '92.0.4515.43',
96 '93.0.4556.0',
97 '93.0.4577.15',
98 '93.0.4577.63',
99 '93.0.4577.82',
100 '94.0.4606.41',
101 '94.0.4606.54',
102 '94.0.4606.61',
103 '94.0.4606.71',
104 '94.0.4606.81',
105 '94.0.4606.85',
106 '95.0.4638.17',
107 '95.0.4638.50',
108 '95.0.4638.54',
109 '95.0.4638.69',
110 '95.0.4638.74',
111 '96.0.4664.18',
112 '96.0.4664.45',
113 '96.0.4664.55',
114 '96.0.4664.93',
115 '97.0.4692.20',
f7a147e3 116 )
117 return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
118
119
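# Usage sketch (illustrative, doctest-style): random_user_agent() interpolates a
# randomly chosen Chrome version into _USER_AGENT_TPL, so every result shares the
# fixed Windows/WebKit prefix while the Chrome/... token varies between calls.
#
#   >>> ua = random_user_agent()
#   >>> ua.startswith('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
#   True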
4390d5ec 120SUPPORTED_ENCODINGS = [
121 'gzip', 'deflate'
122]
9b8ee23b 123if brotli:
4390d5ec 124 SUPPORTED_ENCODINGS.append('br')
125
3e669f36 126std_headers = {
f7a147e3 127 'User-Agent': random_user_agent(),
59ae15a5 128 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59ae15a5 129 'Accept-Language': 'en-us,en;q=0.5',
b1156c1e 130 'Sec-Fetch-Mode': 'navigate',
3e669f36 131}
f427df17 132
5f6a1245 133
fb37eb25 134USER_AGENTS = {
135 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
136}
137
138
bf42a990 139NO_DEFAULT = object()
7b2c3f47 140IDENTITY = lambda x: x
bf42a990 141
7105440c 142ENGLISH_MONTH_NAMES = [
143 'January', 'February', 'March', 'April', 'May', 'June',
144 'July', 'August', 'September', 'October', 'November', 'December']
145
f6717dec 146MONTH_NAMES = {
147 'en': ENGLISH_MONTH_NAMES,
148 'fr': [
3e4185c3 149 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
150 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
78545664 151 # these follow the genitive grammatical case (dopełniacz)
152 # some websites might be using nominative, which will require another month list
153 # https://en.wikibooks.org/wiki/Polish/Noun_cases
154 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
155 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
f6717dec 156}
a942d6cb 157
8f53dc44 158# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
159TIMEZONE_NAMES = {
160 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
161 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
162 'EST': -5, 'EDT': -4, # Eastern
163 'CST': -6, 'CDT': -5, # Central
164 'MST': -7, 'MDT': -6, # Mountain
165 'PST': -8, 'PDT': -7 # Pacific
166}
167
c587cbb7 168# needed for sanitizing filenames in restricted mode
c8827027 169ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
fd35d8cd 170 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
171 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 172
46f59e89 173DATE_FORMATS = (
174 '%d %B %Y',
175 '%d %b %Y',
176 '%B %d %Y',
cb655f34 177 '%B %dst %Y',
178 '%B %dnd %Y',
9d30c213 179 '%B %drd %Y',
cb655f34 180 '%B %dth %Y',
46f59e89 181 '%b %d %Y',
cb655f34 182 '%b %dst %Y',
183 '%b %dnd %Y',
9d30c213 184 '%b %drd %Y',
cb655f34 185 '%b %dth %Y',
46f59e89 186 '%b %dst %Y %I:%M',
187 '%b %dnd %Y %I:%M',
9d30c213 188 '%b %drd %Y %I:%M',
46f59e89 189 '%b %dth %Y %I:%M',
190 '%Y %m %d',
191 '%Y-%m-%d',
bccdbd22 192 '%Y.%m.%d.',
46f59e89 193 '%Y/%m/%d',
81c13222 194 '%Y/%m/%d %H:%M',
46f59e89 195 '%Y/%m/%d %H:%M:%S',
1931a55e 196 '%Y%m%d%H%M',
197 '%Y%m%d%H%M%S',
4f3fa23e 198 '%Y%m%d',
0c1c6f4b 199 '%Y-%m-%d %H:%M',
46f59e89 200 '%Y-%m-%d %H:%M:%S',
201 '%Y-%m-%d %H:%M:%S.%f',
5014558a 202 '%Y-%m-%d %H:%M:%S:%f',
46f59e89 203 '%d.%m.%Y %H:%M',
204 '%d.%m.%Y %H.%M',
205 '%Y-%m-%dT%H:%M:%SZ',
206 '%Y-%m-%dT%H:%M:%S.%fZ',
207 '%Y-%m-%dT%H:%M:%S.%f0Z',
208 '%Y-%m-%dT%H:%M:%S',
209 '%Y-%m-%dT%H:%M:%S.%f',
210 '%Y-%m-%dT%H:%M',
c6eed6b8 211 '%b %d %Y at %H:%M',
212 '%b %d %Y at %H:%M:%S',
b555ae9b 213 '%B %d %Y at %H:%M',
214 '%B %d %Y at %H:%M:%S',
a63d9bd0 215 '%H:%M %d-%b-%Y',
46f59e89 216)
217
218DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
219DATE_FORMATS_DAY_FIRST.extend([
220 '%d-%m-%Y',
221 '%d.%m.%Y',
222 '%d.%m.%y',
223 '%d/%m/%Y',
224 '%d/%m/%y',
225 '%d/%m/%Y %H:%M:%S',
47304e07 226 '%d-%m-%Y %H:%M',
46f59e89 227])
228
229DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
230DATE_FORMATS_MONTH_FIRST.extend([
231 '%m-%d-%Y',
232 '%m.%d.%Y',
233 '%m/%d/%Y',
234 '%m/%d/%y',
235 '%m/%d/%Y %H:%M:%S',
236])
237
06b3fe29 238PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
0f60ba6e 239JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'
06b3fe29 240
1d485a1a 241NUMBER_RE = r'\d+(?:\.\d+)?'
242
7105440c 243
0b9c08b4 244@functools.cache
d77c3dfd 245def preferredencoding():
59ae15a5 246 """Get preferred encoding.
d77c3dfd 247
59ae15a5 248 Returns the best encoding scheme for the system, based on
249 locale.getpreferredencoding() and some further tweaks.
250 """
251 try:
252 pref = locale.getpreferredencoding()
28e614de 253 'TEST'.encode(pref)
70a1165b 254 except Exception:
59ae15a5 255 pref = 'UTF-8'
bae611f2 256
59ae15a5 257 return pref
d77c3dfd 258
f4bfd65f 259
181c8655 260def write_json_file(obj, fn):
1394646a 261 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 262
cfb0511d 263 tf = tempfile.NamedTemporaryFile(
264 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
265 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
181c8655 266
267 try:
268 with tf:
45d86abe 269 json.dump(obj, tf, ensure_ascii=False)
1394646a 270 if sys.platform == 'win32':
271 # Need to remove existing file on Windows, else os.rename raises
272 # WindowsError or FileExistsError.
19a03940 273 with contextlib.suppress(OSError):
1394646a 274 os.unlink(fn)
19a03940 275 with contextlib.suppress(OSError):
9cd5f54e 276 mask = os.umask(0)
277 os.umask(mask)
278 os.chmod(tf.name, 0o666 & ~mask)
181c8655 279 os.rename(tf.name, fn)
70a1165b 280 except Exception:
19a03940 281 with contextlib.suppress(OSError):
181c8655 282 os.remove(tf.name)
181c8655 283 raise
284
285
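# Usage sketch (illustrative, the output path is hypothetical): write_json_file()
# serializes into a NamedTemporaryFile placed in the destination directory and then
# os.rename()s it over the target, so readers never see a half-written file.
#
#   >>> write_json_file({'id': 'abc', 'title': 'Example'}, '/tmp/info.json')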
cfb0511d 286def find_xpath_attr(node, xpath, key, val=None):
287 """ Find the xpath xpath[@key=val] """
288 assert re.match(r'^[a-zA-Z_-]+$', key)
86e5f3ed 289 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
cfb0511d 290 return node.find(expr)
59ae56fa 291
d7e66d39 292# On python2.6 the xml.etree.ElementTree.Element methods don't support
293# the namespace parameter
5f6a1245 294
295
d7e66d39 296def xpath_with_ns(path, ns_map):
297 components = [c.split(':') for c in path.split('/')]
298 replaced = []
299 for c in components:
300 if len(c) == 1:
301 replaced.append(c[0])
302 else:
303 ns, tag = c
304 replaced.append('{%s}%s' % (ns_map[ns], tag))
305 return '/'.join(replaced)
306
d77c3dfd 307
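# Usage sketch (illustrative, the namespace URI is made up): xpath_with_ns() expands
# "prefix:tag" steps into the "{uri}tag" form that xml.etree.ElementTree expects.
#
#   >>> xpath_with_ns('media:song/media:author', {'media': 'http://example.com/ns'})
#   '{http://example.com/ns}song/{http://example.com/ns}author'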
a41fb80c 308def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 309 def _find_xpath(xpath):
f9934b96 310 return node.find(xpath)
578c0745 311
14f25df2 312 if isinstance(xpath, str):
578c0745
S
313 n = _find_xpath(xpath)
314 else:
315 for xp in xpath:
316 n = _find_xpath(xp)
317 if n is not None:
318 break
d74bebd5 319
8e636da4 320 if n is None:
bf42a990
S
321 if default is not NO_DEFAULT:
322 return default
323 elif fatal:
bf0ff932
PH
324 name = xpath if name is None else name
325 raise ExtractorError('Could not find XML element %s' % name)
326 else:
327 return None
a41fb80c
S
328 return n
329
330
331def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
8e636da4
S
332 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
333 if n is None or n == default:
334 return n
335 if n.text is None:
336 if default is not NO_DEFAULT:
337 return default
338 elif fatal:
339 name = xpath if name is None else name
340 raise ExtractorError('Could not find XML element\'s text %s' % name)
341 else:
342 return None
343 return n.text
a41fb80c
S
344
345
346def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
347 n = find_xpath_attr(node, xpath, key)
348 if n is None:
349 if default is not NO_DEFAULT:
350 return default
351 elif fatal:
86e5f3ed 352 name = f'{xpath}[@{key}]' if name is None else name
a41fb80c
S
353 raise ExtractorError('Could not find XML attribute %s' % name)
354 else:
355 return None
356 return n.attrib[key]
bf0ff932
PH
357
358
c487cf00 359def get_element_by_id(id, html, **kwargs):
43e8fafd 360 """Return the content of the tag with the specified ID in the passed HTML document"""
c487cf00 361 return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 362
12ea2f30 363
c487cf00 364def get_element_html_by_id(id, html, **kwargs):
6f32a0b5 365 """Return the html of the tag with the specified ID in the passed HTML document"""
c487cf00 366 return get_element_html_by_attribute('id', id, html, **kwargs)
6f32a0b5
ZM
367
368
84c237fb 369def get_element_by_class(class_name, html):
2af12ad9
TC
370 """Return the content of the first tag with the specified class in the passed HTML document"""
371 retval = get_elements_by_class(class_name, html)
372 return retval[0] if retval else None
373
374
6f32a0b5
ZM
375def get_element_html_by_class(class_name, html):
376 """Return the html of the first tag with the specified class in the passed HTML document"""
377 retval = get_elements_html_by_class(class_name, html)
378 return retval[0] if retval else None
379
380
c487cf00 381def get_element_by_attribute(attribute, value, html, **kwargs):
382 retval = get_elements_by_attribute(attribute, value, html, **kwargs)
2af12ad9
TC
383 return retval[0] if retval else None
384
385
c487cf00 386def get_element_html_by_attribute(attribute, value, html, **kargs):
387 retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
6f32a0b5
ZM
388 return retval[0] if retval else None
389
390
c487cf00 391def get_elements_by_class(class_name, html, **kargs):
2af12ad9
TC
392 """Return the content of all tags with the specified class in the passed HTML document as a list"""
393 return get_elements_by_attribute(
64fa820c 394 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
84c237fb
YCH
395 html, escape_value=False)
396
397
6f32a0b5
ZM
398def get_elements_html_by_class(class_name, html):
399 """Return the html of all tags with the specified class in the passed HTML document as a list"""
400 return get_elements_html_by_attribute(
64fa820c 401 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
6f32a0b5
ZM
402 html, escape_value=False)
403
404
405def get_elements_by_attribute(*args, **kwargs):
43e8fafd 406 """Return the content of the tag with the specified attribute in the passed HTML document"""
6f32a0b5
ZM
407 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
408
409
410def get_elements_html_by_attribute(*args, **kwargs):
411 """Return the html of the tag with the specified attribute in the passed HTML document"""
412 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
413
414
4c9a1a3b 415def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
6f32a0b5
ZM
416 """
417 Return the text (content) and the html (whole) of the tag with the specified
418 attribute in the passed HTML document
419 """
c61473c1
M
420 if not value:
421 return
9e6dd238 422
86e5f3ed 423 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
0254f162 424
84c237fb
YCH
425 value = re.escape(value) if escape_value else value
426
86e5f3ed 427 partial_element_re = rf'''(?x)
4c9a1a3b 428 <(?P<tag>{tag})
0254f162 429 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
86e5f3ed 430 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
431 '''
38285056 432
0254f162
ZM
433 for m in re.finditer(partial_element_re, html):
434 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
a921f407 435
0254f162
ZM
436 yield (
437 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
438 whole
439 )
a921f407 440
c5229f39 441
ac668111 442class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
6f32a0b5
ZM
443 """
444 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
445 closing tag for the first opening tag it has encountered, and can be used
446 as a context manager
447 """
448
449 class HTMLBreakOnClosingTagException(Exception):
450 pass
451
452 def __init__(self):
453 self.tagstack = collections.deque()
ac668111 454 html.parser.HTMLParser.__init__(self)
6f32a0b5
ZM
455
456 def __enter__(self):
457 return self
458
459 def __exit__(self, *_):
460 self.close()
461
462 def close(self):
463 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
464 # so data remains buffered; we no longer have any interest in it, thus
465 # override this method to discard it
466 pass
467
468 def handle_starttag(self, tag, _):
469 self.tagstack.append(tag)
470
471 def handle_endtag(self, tag):
472 if not self.tagstack:
473 raise compat_HTMLParseError('no tags in the stack')
474 while self.tagstack:
475 inner_tag = self.tagstack.pop()
476 if inner_tag == tag:
477 break
478 else:
479 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
480 if not self.tagstack:
481 raise self.HTMLBreakOnClosingTagException()
482
483
46d09f87 484# XXX: This should be far less strict
6f32a0b5
ZM
485def get_element_text_and_html_by_tag(tag, html):
486 """
487 For the first element with the specified tag in the passed HTML document
488 return its content (text) and the whole element (html)
489 """
490 def find_or_raise(haystack, needle, exc):
491 try:
492 return haystack.index(needle)
493 except ValueError:
494 raise exc
495 closing_tag = f'</{tag}>'
496 whole_start = find_or_raise(
497 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
498 content_start = find_or_raise(
499 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
500 content_start += whole_start + 1
501 with HTMLBreakOnClosingTagParser() as parser:
502 parser.feed(html[whole_start:content_start])
503 if not parser.tagstack or parser.tagstack[0] != tag:
504 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
505 offset = content_start
506 while offset < len(html):
507 next_closing_tag_start = find_or_raise(
508 html[offset:], closing_tag,
509 compat_HTMLParseError(f'closing {tag} tag not found'))
510 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
511 try:
512 parser.feed(html[offset:offset + next_closing_tag_end])
513 offset += next_closing_tag_end
514 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
515 return html[content_start:offset + next_closing_tag_start], \
516 html[whole_start:offset + next_closing_tag_end]
517 raise compat_HTMLParseError('unexpected end of html')
518
519
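# Usage sketch (illustrative): for a well-formed snippet the helper returns the inner
# text and the whole element; nested tags of the same name are tracked through
# HTMLBreakOnClosingTagParser so the correct closing tag is chosen.
#
#   >>> get_element_text_and_html_by_tag('a', '<div><a href="x">text</a></div>')
#   ('text', '<a href="x">text</a>')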
ac668111 520class HTMLAttributeParser(html.parser.HTMLParser):
8bb56eee 521 """Trivial HTML parser to gather the attributes for a single element"""
b6e0c7d2 522
8bb56eee 523 def __init__(self):
c5229f39 524 self.attrs = {}
ac668111 525 html.parser.HTMLParser.__init__(self)
8bb56eee
BF
526
527 def handle_starttag(self, tag, attrs):
528 self.attrs = dict(attrs)
7053aa3a 529 raise compat_HTMLParseError('done')
8bb56eee 530
c5229f39 531
ac668111 532class HTMLListAttrsParser(html.parser.HTMLParser):
73673ccf
FF
533 """HTML parser to gather the attributes for the elements of a list"""
534
535 def __init__(self):
ac668111 536 html.parser.HTMLParser.__init__(self)
73673ccf
FF
537 self.items = []
538 self._level = 0
539
540 def handle_starttag(self, tag, attrs):
541 if tag == 'li' and self._level == 0:
542 self.items.append(dict(attrs))
543 self._level += 1
544
545 def handle_endtag(self, tag):
546 self._level -= 1
547
548
8bb56eee
BF
549def extract_attributes(html_element):
550 """Given a string for an HTML element such as
551 <el
552 a="foo" B="bar" c="&98;az" d=boz
553 empty= noval entity="&amp;"
554 sq='"' dq="'"
555 >
556 Decode and return a dictionary of attributes.
557 {
558 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
559 'empty': '', 'noval': None, 'entity': '&',
560 'sq': '"', 'dq': '\''
561 }.
8bb56eee
BF
562 """
563 parser = HTMLAttributeParser()
19a03940 564 with contextlib.suppress(compat_HTMLParseError):
b4a3d461
S
565 parser.feed(html_element)
566 parser.close()
8bb56eee 567 return parser.attrs
9e6dd238 568
c5229f39 569
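# Usage sketch (illustrative): attribute names are lowercased by html.parser and
# entities in values are decoded, as described in the docstring above.
#
#   >>> extract_attributes('<a href="foo" B="bar" entity="&amp;">')
#   {'href': 'foo', 'b': 'bar', 'entity': '&'}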
73673ccf
FF
570def parse_list(webpage):
571 """Given a string for an series of HTML <li> elements,
572 return a dictionary of their attributes"""
573 parser = HTMLListAttrsParser()
574 parser.feed(webpage)
575 parser.close()
576 return parser.items
577
578
9e6dd238 579def clean_html(html):
59ae15a5 580 """Clean an HTML snippet into a readable string"""
dd622d7c
PH
581
582 if html is None: # Convenience for sanitizing descriptions etc.
583 return html
584
49185227 585 html = re.sub(r'\s+', ' ', html)
586 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
587 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
59ae15a5
PH
588 # Strip html tags
589 html = re.sub('<.*?>', '', html)
590 # Replace html entities
591 html = unescapeHTML(html)
7decf895 592 return html.strip()
9e6dd238
FV
593
594
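# Usage sketch (illustrative): whitespace is collapsed, <br> and </p><p> become
# newlines, remaining tags are stripped and entities are decoded.
#
#   >>> clean_html('<p>Foo   <b>bar</b> &amp; baz</p>')
#   'Foo bar & baz'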
b7c47b74 595class LenientJSONDecoder(json.JSONDecoder):
cc090836 596 # TODO: Write tests
597 def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
b7c47b74 598 self.transform_source, self.ignore_extra = transform_source, ignore_extra
cc090836 599 self._close_attempts = 2 * close_objects
b7c47b74 600 super().__init__(*args, **kwargs)
601
cc090836 602 @staticmethod
603 def _close_object(err):
604 doc = err.doc[:err.pos]
605 # We need to add comma first to get the correct error message
606 if err.msg.startswith('Expecting \',\''):
607 return doc + ','
608 elif not doc.endswith(','):
609 return
610
611 if err.msg.startswith('Expecting property name'):
612 return doc[:-1] + '}'
613 elif err.msg.startswith('Expecting value'):
614 return doc[:-1] + ']'
615
b7c47b74 616 def decode(self, s):
617 if self.transform_source:
618 s = self.transform_source(s)
cc090836 619 for attempt in range(self._close_attempts + 1):
620 try:
621 if self.ignore_extra:
622 return self.raw_decode(s.lstrip())[0]
623 return super().decode(s)
624 except json.JSONDecodeError as e:
625 if e.pos is None:
626 raise
627 elif attempt < self._close_attempts:
628 s = self._close_object(e)
629 if s is not None:
630 continue
2fa669f7 631 raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
cc090836 632 assert False, 'Too many attempts to decode JSON'
b7c47b74 633
634
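# Usage sketch (illustrative): with ignore_extra=True the decoder returns the first
# JSON value and ignores trailing garbage; close_objects additionally lets it retry
# truncated input by appending ',' / '}' / ']' as suggested by the decode error.
#
#   >>> LenientJSONDecoder(ignore_extra=True).decode('{"a": 1} // trailing junk')
#   {'a': 1}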
d77c3dfd 635def sanitize_open(filename, open_mode):
59ae15a5 636 """Try to open the given filename, and slightly tweak it if this fails.
637
638 Attempts to open the given filename. If this fails, it tries to change
639 the filename slightly, step by step, until it's either able to open it
640 or it fails and raises a final exception, like the standard open()
641 function.
642
643 It returns the tuple (stream, definitive_file_name).
644 """
0edb3e33 645 if filename == '-':
646 if sys.platform == 'win32':
647 import msvcrt
be5c1ae8 648
62b58c09 649 # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
daef7911 650 with contextlib.suppress(io.UnsupportedOperation):
651 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
0edb3e33 652 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5 653
0edb3e33 654 for attempt in range(2):
655 try:
656 try:
89737671 657 if sys.platform == 'win32':
b506289f 658 # FIXME: An exclusive lock also locks the file from being read.
659 # Since windows locks are mandatory, don't lock the file on windows (for now).
660 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
89737671 661 raise LockingUnsupportedError()
0edb3e33 662 stream = locked_file(filename, open_mode, block=False).__enter__()
8a82af35 663 except OSError:
0edb3e33 664 stream = open(filename, open_mode)
8a82af35 665 return stream, filename
86e5f3ed 666 except OSError as err:
0edb3e33 667 if attempt or err.errno in (errno.EACCES,):
668 raise
669 old_filename, filename = filename, sanitize_path(filename)
670 if old_filename == filename:
671 raise
d77c3dfd
FV
672
673
674def timeconvert(timestr):
59ae15a5 675 """Convert RFC 2822 defined time string into system timestamp"""
676 timestamp = None
677 timetuple = email.utils.parsedate_tz(timestr)
678 if timetuple is not None:
679 timestamp = email.utils.mktime_tz(timetuple)
680 return timestamp
1c469a94 681
5f6a1245 682
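# Usage sketch (illustrative): RFC 2822 dates parse to a POSIX timestamp; strings
# that email.utils cannot parse yield None.
#
#   >>> timeconvert('Thu, 01 Jan 1970 00:00:00 +0000')
#   0
#   >>> timeconvert('not a date') is None
#   True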
5c3895ff 683def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
59ae15a5 684 """Sanitizes a string so it could be used as part of a filename.
5c3895ff 685 @param restricted Use a stricter subset of allowed characters
686 @param is_id Whether this is an ID that should be kept unchanged if possible.
687 If unset, yt-dlp's new sanitization rules are in effect
59ae15a5 688 """
5c3895ff 689 if s == '':
690 return ''
691
59ae15a5 692 def replace_insane(char):
c587cbb7
AT
693 if restricted and char in ACCENT_CHARS:
694 return ACCENT_CHARS[char]
91dd88b9 695 elif not restricted and char == '\n':
5c3895ff 696 return '\0 '
989a01c2 697 elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
698 # Replace with their full-width unicode counterparts
699 return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
91dd88b9 700 elif char == '?' or ord(char) < 32 or ord(char) == 127:
59ae15a5
PH
701 return ''
702 elif char == '"':
703 return '' if restricted else '\''
704 elif char == ':':
5c3895ff 705 return '\0_\0-' if restricted else '\0 \0-'
59ae15a5 706 elif char in '\\/|*<>':
5c3895ff 707 return '\0_'
708 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
709 return '\0_'
59ae15a5
PH
710 return char
711
db4678e4 712 # Replace look-alike Unicode glyphs
713 if restricted and (is_id is NO_DEFAULT or not is_id):
989a01c2 714 s = unicodedata.normalize('NFKC', s)
5c3895ff 715 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
28e614de 716 result = ''.join(map(replace_insane, s))
5c3895ff 717 if is_id is NO_DEFAULT:
ae61d108 718 result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars
719 STRIP_RE = r'(?:\0.|[ _-])*'
5c3895ff 720 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
721 result = result.replace('\0', '') or '_'
722
796173d0
PH
723 if not is_id:
724 while '__' in result:
725 result = result.replace('__', '_')
726 result = result.strip('_')
727 # Common case of "Foreign band name - English song title"
728 if restricted and result.startswith('-_'):
729 result = result[2:]
5a42414b
PH
730 if result.startswith('-'):
731 result = '_' + result[len('-'):]
a7440261 732 result = result.lstrip('.')
796173d0
PH
733 if not result:
734 result = '_'
59ae15a5 735 return result
d77c3dfd 736
5f6a1245 737
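# Usage sketch (illustrative): in restricted mode accented characters are mapped via
# ACCENT_CHARS, spaces become '_' and characters outside the safe set are dropped;
# in the default mode, Windows-forbidden characters are replaced by look-alike
# full-width glyphs instead.
#
#   >>> sanitize_filename('Déjà vu?', restricted=True)
#   'Deja_vu'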
c2934512 738def sanitize_path(s, force=False):
a2aaf4db 739 """Sanitizes and normalizes path on Windows"""
c2934512 740 if sys.platform == 'win32':
c4218ac3 741 force = False
c2934512 742 drive_or_unc, _ = os.path.splitdrive(s)
c2934512 743 elif force:
744 drive_or_unc = ''
745 else:
a2aaf4db 746 return s
c2934512 747
be531ef1
S
748 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
749 if drive_or_unc:
a2aaf4db
S
750 norm_path.pop(0)
751 sanitized_path = [
ec85ded8 752 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 753 for path_part in norm_path]
be531ef1
S
754 if drive_or_unc:
755 sanitized_path.insert(0, drive_or_unc + os.path.sep)
4abea8ca 756 elif force and s and s[0] == os.path.sep:
c4218ac3 757 sanitized_path.insert(0, os.path.sep)
a2aaf4db
S
758 return os.path.join(*sanitized_path)
759
760
8f97a15d 761def sanitize_url(url, *, scheme='http'):
befa4708
S
762 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
763 # the number of unwanted failures due to missing protocol
21633673 764 if url is None:
765 return
766 elif url.startswith('//'):
8f97a15d 767 return f'{scheme}:{url}'
befa4708
S
768 # Fix some common typos seen so far
769 COMMON_TYPOS = (
067aa17e 770 # https://github.com/ytdl-org/youtube-dl/issues/15649
befa4708
S
771 (r'^httpss://', r'https://'),
772 # https://bx1.be/lives/direct-tv/
773 (r'^rmtp([es]?)://', r'rtmp\1://'),
774 )
775 for mistake, fixup in COMMON_TYPOS:
776 if re.match(mistake, url):
777 return re.sub(mistake, fixup, url)
bc6b9bcd 778 return url
17bcc626
S
779
780
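# Usage sketch (illustrative): protocol-less URLs get the given scheme prepended and
# a couple of known typos are repaired; anything else passes through unchanged.
#
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com/video')
#   'https://example.com/video'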
5435dcf9 781def extract_basic_auth(url):
14f25df2 782 parts = urllib.parse.urlsplit(url)
5435dcf9
HH
783 if parts.username is None:
784 return url, None
14f25df2 785 url = urllib.parse.urlunsplit(parts._replace(netloc=(
5435dcf9
HH
786 parts.hostname if parts.port is None
787 else '%s:%d' % (parts.hostname, parts.port))))
788 auth_payload = base64.b64encode(
0f06bcd7 789 ('%s:%s' % (parts.username, parts.password or '')).encode())
790 return url, f'Basic {auth_payload.decode()}'
5435dcf9
HH
791
792
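# Usage sketch (illustrative, 'user:pass@example.com' is a made-up URL): credentials
# embedded in the URL are stripped and returned as a ready-to-use Authorization
# header value.
#
#   >>> extract_basic_auth('http://user:pass@example.com/')
#   ('http://example.com/', 'Basic dXNlcjpwYXNz')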
67dda517 793def sanitized_Request(url, *args, **kwargs):
bc6b9bcd 794 url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
5435dcf9
HH
795 if auth_header is not None:
796 headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
797 headers['Authorization'] = auth_header
ac668111 798 return urllib.request.Request(url, *args, **kwargs)
67dda517
S
799
800
51098426 801def expand_path(s):
2fa669f7 802 """Expand shell variables and ~"""
51098426
S
803 return os.path.expandvars(compat_expanduser(s))
804
805
7e9a6125 806def orderedSet(iterable, *, lazy=False):
807 """Remove all duplicates from the input iterable"""
808 def _iter():
809 seen = [] # Do not use set since the items can be unhashable
810 for x in iterable:
811 if x not in seen:
812 seen.append(x)
813 yield x
814
815 return _iter() if lazy else list(_iter())
d77c3dfd 816
912b38b4 817
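# Usage sketch (illustrative): duplicates are removed while first-seen order is kept;
# lazy=True yields a generator instead of a materialized list.
#
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]
#   >>> list(orderedSet('abca', lazy=True))
#   ['a', 'b', 'c']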
55b2f099 818def _htmlentity_transform(entity_with_semicolon):
4e408e47 819 """Transforms an HTML entity to a character."""
55b2f099
YCH
820 entity = entity_with_semicolon[:-1]
821
4e408e47 822 # Known non-numeric HTML entity
ac668111 823 if entity in html.entities.name2codepoint:
824 return chr(html.entities.name2codepoint[entity])
4e408e47 825
62b58c09
L
826 # TODO: HTML5 allows entities without a semicolon.
827 # E.g. '&Eacuteric' should be decoded as 'Éric'.
ac668111 828 if entity_with_semicolon in html.entities.html5:
829 return html.entities.html5[entity_with_semicolon]
55b2f099 830
91757b0f 831 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
832 if mobj is not None:
833 numstr = mobj.group(1)
28e614de 834 if numstr.startswith('x'):
4e408e47 835 base = 16
28e614de 836 numstr = '0%s' % numstr
4e408e47
PH
837 else:
838 base = 10
067aa17e 839 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 840 with contextlib.suppress(ValueError):
ac668111 841 return chr(int(numstr, base))
4e408e47
PH
842
843 # Unknown entity in name, return its literal representation
7a3f0c00 844 return '&%s;' % entity
4e408e47
PH
845
846
d77c3dfd 847def unescapeHTML(s):
912b38b4
PH
848 if s is None:
849 return None
19a03940 850 assert isinstance(s, str)
d77c3dfd 851
4e408e47 852 return re.sub(
95f3f7c2 853 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 854
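# Usage sketch (illustrative): named, decimal and hexadecimal entities are all
# handled by _htmlentity_transform.
#
#   >>> unescapeHTML('&amp; &#38; &#x26;')
#   '& & &'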
8bf48f23 855
cdb19aa4 856def escapeHTML(text):
857 return (
858 text
859 .replace('&', '&amp;')
860 .replace('<', '&lt;')
861 .replace('>', '&gt;')
862 .replace('"', '&quot;')
863 .replace("'", '&#39;')
864 )
865
866
f5b1bca9 867def process_communicate_or_kill(p, *args, **kwargs):
da4db748 868 deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
869 f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
8a82af35 870 return Popen.communicate_or_kill(p, *args, **kwargs)
f5b1bca9 871
872
d3c93ec2 873class Popen(subprocess.Popen):
874 if sys.platform == 'win32':
875 _startupinfo = subprocess.STARTUPINFO()
876 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
877 else:
878 _startupinfo = None
879
82ea226c
L
880 @staticmethod
881 def _fix_pyinstaller_ld_path(env):
882 """Restore LD_LIBRARY_PATH when using PyInstaller
883 Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
884 https://github.com/yt-dlp/yt-dlp/issues/4573
885 """
886 if not hasattr(sys, '_MEIPASS'):
887 return
888
889 def _fix(key):
890 orig = env.get(f'{key}_ORIG')
891 if orig is None:
892 env.pop(key, None)
893 else:
894 env[key] = orig
895
896 _fix('LD_LIBRARY_PATH') # Linux
897 _fix('DYLD_LIBRARY_PATH') # macOS
898
899 def __init__(self, *args, env=None, text=False, **kwargs):
900 if env is None:
901 env = os.environ.copy()
902 self._fix_pyinstaller_ld_path(env)
903
da8e2912 904 self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
f0c9fb96 905 if text is True:
906 kwargs['universal_newlines'] = True # For 3.6 compatibility
907 kwargs.setdefault('encoding', 'utf-8')
908 kwargs.setdefault('errors', 'replace')
82ea226c 909 super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)
d3c93ec2 910
911 def communicate_or_kill(self, *args, **kwargs):
8a82af35 912 try:
913 return self.communicate(*args, **kwargs)
914 except BaseException: # Including KeyboardInterrupt
f0c9fb96 915 self.kill(timeout=None)
8a82af35 916 raise
d3c93ec2 917
f0c9fb96 918 def kill(self, *, timeout=0):
919 super().kill()
920 if timeout != 0:
921 self.wait(timeout=timeout)
922
923 @classmethod
992dc6b4 924 def run(cls, *args, timeout=None, **kwargs):
f0c9fb96 925 with cls(*args, **kwargs) as proc:
da8e2912 926 default = '' if proc.__text_mode else b''
992dc6b4 927 stdout, stderr = proc.communicate_or_kill(timeout=timeout)
914491b8 928 return stdout or default, stderr or default, proc.returncode
f0c9fb96 929
d3c93ec2 930
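# Usage sketch (illustrative, a POSIX system with /bin/echo is assumed): Popen.run()
# spawns the process, waits via communicate_or_kill() and returns (stdout, stderr,
# returncode), substituting '' or b'' for streams that were not captured.
#
#   >>> stdout, stderr, retcode = Popen.run(
#   ...     ['echo', 'hello'], text=True, stdout=subprocess.PIPE)
#   >>> stdout.strip(), retcode
#   ('hello', 0)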
aa49acd1
S
931def get_subprocess_encoding():
932 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
933 # For subprocess calls, encode with locale encoding
934 # Refer to http://stackoverflow.com/a/9951851/35070
935 encoding = preferredencoding()
936 else:
937 encoding = sys.getfilesystemencoding()
938 if encoding is None:
939 encoding = 'utf-8'
940 return encoding
941
942
8bf48f23 943def encodeFilename(s, for_subprocess=False):
19a03940 944 assert isinstance(s, str)
cfb0511d 945 return s
aa49acd1
S
946
947
948def decodeFilename(b, for_subprocess=False):
cfb0511d 949 return b
8bf48f23 950
f07b74fc
PH
951
952def encodeArgument(s):
cfb0511d 953 # Legacy code that uses byte strings
954 # Uncomment the following line after fixing all post processors
14f25df2 955 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
cfb0511d 956 return s if isinstance(s, str) else s.decode('ascii')
f07b74fc
PH
957
958
aa49acd1 959def decodeArgument(b):
cfb0511d 960 return b
aa49acd1
S
961
962
8271226a
PH
963def decodeOption(optval):
964 if optval is None:
965 return optval
966 if isinstance(optval, bytes):
967 optval = optval.decode(preferredencoding())
968
14f25df2 969 assert isinstance(optval, str)
8271226a 970 return optval
1c256f70 971
5f6a1245 972
aa7785f8 973_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
974
975
976def timetuple_from_msec(msec):
977 secs, msec = divmod(msec, 1000)
978 mins, secs = divmod(secs, 60)
979 hrs, mins = divmod(mins, 60)
980 return _timetuple(hrs, mins, secs, msec)
981
982
cdb19aa4 983def formatSeconds(secs, delim=':', msec=False):
aa7785f8 984 time = timetuple_from_msec(secs * 1000)
985 if time.hours:
986 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
987 elif time.minutes:
988 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
4539dd30 989 else:
aa7785f8 990 ret = '%d' % time.seconds
991 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
4539dd30 992
a0ddb8a2 993
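# Usage sketch (illustrative): timetuple_from_msec() splits milliseconds into
# (hours, minutes, seconds, milliseconds) and formatSeconds() renders them,
# omitting leading zero fields.
#
#   >>> timetuple_from_msec(3661500)
#   Time(hours=1, minutes=1, seconds=1, milliseconds=500)
#   >>> formatSeconds(3661.5, msec=True)
#   '1:01:01.500'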
77562778 994def _ssl_load_windows_store_certs(ssl_context, storename):
995 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
996 try:
997 certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
998 if encoding == 'x509_asn' and (
999 trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
1000 except PermissionError:
1001 return
1002 for cert in certs:
19a03940 1003 with contextlib.suppress(ssl.SSLError):
77562778 1004 ssl_context.load_verify_locations(cadata=cert)
a2366922 1005
77562778 1006
1007def make_HTTPS_handler(params, **kwargs):
1008 opts_check_certificate = not params.get('nocheckcertificate')
1009 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
1010 context.check_hostname = opts_check_certificate
f81c62a6 1011 if params.get('legacyserverconnect'):
1012 context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
4f28b537 1013 # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
1014 context.set_ciphers('DEFAULT')
ac8e69dd
M
1015 elif (
1016 sys.version_info < (3, 10)
1017 and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
1018 and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
1019 ):
5b9f253f
M
1020 # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
1021 # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
1022 # in some situations [2][3].
1023 # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
1024 # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
ac8e69dd 1025 # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
5b9f253f
M
1026 # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
1027 # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
1028 # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
1029 # 4. https://peps.python.org/pep-0644/
ac8e69dd
M
1030 # 5. https://peps.python.org/pep-0644/#libressl-support
1031 # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
5b9f253f
M
1032 context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
1033 context.minimum_version = ssl.TLSVersion.TLSv1_2
8a82af35 1034
77562778 1035 context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
1036 if opts_check_certificate:
d5820461 1037 if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
1038 context.load_verify_locations(cafile=certifi.where())
168bbc4f 1039 else:
1040 try:
1041 context.load_default_certs()
1042 # Work around the issue in load_default_certs when there are bad certificates. See:
1043 # https://github.com/yt-dlp/yt-dlp/issues/1060,
1044 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
1045 except ssl.SSLError:
1046 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
1047 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
1048 for storename in ('CA', 'ROOT'):
1049 _ssl_load_windows_store_certs(context, storename)
1050 context.set_default_verify_paths()
8a82af35 1051
bb58c9ed 1052 client_certfile = params.get('client_certificate')
1053 if client_certfile:
1054 try:
1055 context.load_cert_chain(
1056 client_certfile, keyfile=params.get('client_certificate_key'),
1057 password=params.get('client_certificate_password'))
1058 except ssl.SSLError:
1059 raise YoutubeDLError('Unable to load client certificate')
2c6dcb65 1060
1061 # Some servers may reject requests if ALPN extension is not sent. See:
1062 # https://github.com/python/cpython/issues/85140
1063 # https://github.com/yt-dlp/yt-dlp/issues/3878
1064 with contextlib.suppress(NotImplementedError):
1065 context.set_alpn_protocols(['http/1.1'])
1066
77562778 1067 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 1068
732ea2f0 1069
5873d4cc 1070def bug_reports_message(before=';'):
57e0f077 1071 from .update import REPOSITORY
1072
1073 msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
1074 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
5873d4cc
F
1075
1076 before = before.rstrip()
1077 if not before or before.endswith(('.', '!', '?')):
1078 msg = msg[0].title() + msg[1:]
1079
1080 return (before + ' ' if before else '') + msg
08f2a92c
JMF
1081
1082
bf5b9d85
PM
1083class YoutubeDLError(Exception):
1084 """Base exception for YoutubeDL errors."""
aa9369a2 1085 msg = None
1086
1087 def __init__(self, msg=None):
1088 if msg is not None:
1089 self.msg = msg
1090 elif self.msg is None:
1091 self.msg = type(self).__name__
1092 super().__init__(self.msg)
bf5b9d85
PM
1093
1094
ac668111 1095network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
3158150c 1096if hasattr(ssl, 'CertificateError'):
1097 network_exceptions.append(ssl.CertificateError)
1098network_exceptions = tuple(network_exceptions)
1099
1100
bf5b9d85 1101class ExtractorError(YoutubeDLError):
1c256f70 1102 """Error during info extraction."""
5f6a1245 1103
1151c407 1104 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
9a82b238 1105 """ tb, if given, is the original traceback (so that it can be printed out).
7a5c1cfe 1106 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
9a82b238 1107 """
3158150c 1108 if sys.exc_info()[0] in network_exceptions:
9a82b238 1109 expected = True
d5979c5d 1110
7265a219 1111 self.orig_msg = str(msg)
1c256f70 1112 self.traceback = tb
1151c407 1113 self.expected = expected
2eabb802 1114 self.cause = cause
d11271dd 1115 self.video_id = video_id
1151c407 1116 self.ie = ie
1117 self.exc_info = sys.exc_info() # preserve original exception
5df14442 1118 if isinstance(self.exc_info[1], ExtractorError):
1119 self.exc_info = self.exc_info[1].exc_info
9bcfe33b 1120 super().__init__(self.__msg)
1151c407 1121
9bcfe33b 1122 @property
1123 def __msg(self):
1124 return ''.join((
1125 format_field(self.ie, None, '[%s] '),
1126 format_field(self.video_id, None, '%s: '),
1127 self.orig_msg,
1128 format_field(self.cause, None, ' (caused by %r)'),
1129 '' if self.expected else bug_reports_message()))
1c256f70 1130
01951dda 1131 def format_traceback(self):
497d2fab 1132 return join_nonempty(
1133 self.traceback and ''.join(traceback.format_tb(self.traceback)),
e491d06d 1134 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
497d2fab 1135 delim='\n') or None
01951dda 1136
9bcfe33b 1137 def __setattr__(self, name, value):
1138 super().__setattr__(name, value)
1139 if getattr(self, 'msg', None) and name not in ('msg', 'args'):
1140 self.msg = self.__msg or type(self).__name__
1141 self.args = (self.msg, ) # Cannot be property
1142
1c256f70 1143
416c7fcb
PH
1144class UnsupportedError(ExtractorError):
1145 def __init__(self, url):
86e5f3ed 1146 super().__init__(
416c7fcb
PH
1147 'Unsupported URL: %s' % url, expected=True)
1148 self.url = url
1149
1150
55b3e45b
JMF
1151class RegexNotFoundError(ExtractorError):
1152 """Error when a regex didn't match"""
1153 pass
1154
1155
773f291d
S
1156class GeoRestrictedError(ExtractorError):
1157 """Geographic restriction Error exception.
1158
1159 This exception may be thrown when a video is not available from your
1160 geographic location due to geographic restrictions imposed by a website.
1161 """
b6e0c7d2 1162
0db3bae8 1163 def __init__(self, msg, countries=None, **kwargs):
1164 kwargs['expected'] = True
86e5f3ed 1165 super().__init__(msg, **kwargs)
773f291d
S
1166 self.countries = countries
1167
1168
693f0600 1169class UserNotLive(ExtractorError):
1170 """Error when a channel/user is not live"""
1171
1172 def __init__(self, msg=None, **kwargs):
1173 kwargs['expected'] = True
1174 super().__init__(msg or 'The channel is not currently live', **kwargs)
1175
1176
bf5b9d85 1177class DownloadError(YoutubeDLError):
59ae15a5 1178 """Download Error exception.
d77c3dfd 1179
59ae15a5
PH
1180 This exception may be thrown by FileDownloader objects if they are not
1181 configured to continue on errors. They will contain the appropriate
1182 error message.
1183 """
5f6a1245 1184
8cc83b8d
FV
1185 def __init__(self, msg, exc_info=None):
1186 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
86e5f3ed 1187 super().__init__(msg)
8cc83b8d 1188 self.exc_info = exc_info
d77c3dfd
FV
1189
1190
498f5606 1191class EntryNotInPlaylist(YoutubeDLError):
1192 """Entry not in playlist exception.
1193
1194 This exception will be thrown by YoutubeDL when a requested entry
1195 is not found in the playlist info_dict
1196 """
aa9369a2 1197 msg = 'Entry not found in info'
498f5606 1198
1199
bf5b9d85 1200class SameFileError(YoutubeDLError):
59ae15a5 1201 """Same File exception.
d77c3dfd 1202
59ae15a5
PH
1203 This exception will be thrown by FileDownloader objects if they detect
1204 multiple files would have to be downloaded to the same file on disk.
1205 """
aa9369a2 1206 msg = 'Fixed output name but more than one file to download'
1207
1208 def __init__(self, filename=None):
1209 if filename is not None:
1210 self.msg += f': {filename}'
1211 super().__init__(self.msg)
d77c3dfd
FV
1212
1213
bf5b9d85 1214class PostProcessingError(YoutubeDLError):
59ae15a5 1215 """Post Processing exception.
d77c3dfd 1216
59ae15a5
PH
1217 This exception may be raised by PostProcessor's .run() method to
1218 indicate an error in the postprocessing task.
1219 """
5f6a1245 1220
5f6a1245 1221
48f79687 1222class DownloadCancelled(YoutubeDLError):
1223 """ Exception raised when the download queue should be interrupted """
1224 msg = 'The download was cancelled'
8b0d7497 1225
8b0d7497 1226
48f79687 1227class ExistingVideoReached(DownloadCancelled):
1228 """ --break-on-existing triggered """
1229 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1230
48f79687 1231
1232class RejectedVideoReached(DownloadCancelled):
fe2ce85a 1233 """ --break-match-filter triggered """
1234 msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
51d9739f 1235
1236
48f79687 1237class MaxDownloadsReached(DownloadCancelled):
59ae15a5 1238 """ --max-downloads limit has been reached. """
48f79687 1239 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1240
1241
f2ebc5c7 1242class ReExtractInfo(YoutubeDLError):
1243 """ Video info needs to be re-extracted. """
1244
1245 def __init__(self, msg, expected=False):
1246 super().__init__(msg)
1247 self.expected = expected
1248
1249
1250class ThrottledDownload(ReExtractInfo):
48f79687 1251 """ Download speed below --throttled-rate. """
aa9369a2 1252 msg = 'The download speed is below throttle limit'
d77c3dfd 1253
43b22906 1254 def __init__(self):
1255 super().__init__(self.msg, expected=False)
f2ebc5c7 1256
d77c3dfd 1257
bf5b9d85 1258class UnavailableVideoError(YoutubeDLError):
59ae15a5 1259 """Unavailable Format exception.
d77c3dfd 1260
59ae15a5
PH
1261 This exception will be thrown when a video is requested
1262 in a format that is not available for that video.
1263 """
aa9369a2 1264 msg = 'Unable to download video'
1265
1266 def __init__(self, err=None):
1267 if err is not None:
1268 self.msg += f': {err}'
1269 super().__init__(self.msg)
d77c3dfd
FV
1270
1271
bf5b9d85 1272class ContentTooShortError(YoutubeDLError):
59ae15a5 1273 """Content Too Short exception.
d77c3dfd 1274
59ae15a5
PH
1275 This exception may be raised by FileDownloader objects when a file they
1276 download is too small for what the server announced first, indicating
1277 the connection was probably interrupted.
1278 """
d77c3dfd 1279
59ae15a5 1280 def __init__(self, downloaded, expected):
86e5f3ed 1281 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
2c7ed247 1282 # Both in bytes
59ae15a5
PH
1283 self.downloaded = downloaded
1284 self.expected = expected
d77c3dfd 1285
5f6a1245 1286
bf5b9d85 1287class XAttrMetadataError(YoutubeDLError):
efa97bdc 1288 def __init__(self, code=None, msg='Unknown error'):
86e5f3ed 1289 super().__init__(msg)
efa97bdc 1290 self.code = code
bd264412 1291 self.msg = msg
efa97bdc
YCH
1292
1293 # Parsing code and msg
3089bc74 1294 if (self.code in (errno.ENOSPC, errno.EDQUOT)
a0566bbf 1295 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
efa97bdc
YCH
1296 self.reason = 'NO_SPACE'
1297 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1298 self.reason = 'VALUE_TOO_LONG'
1299 else:
1300 self.reason = 'NOT_SUPPORTED'
1301
1302
bf5b9d85 1303class XAttrUnavailableError(YoutubeDLError):
efa97bdc
YCH
1304 pass
1305
1306
c5a59d93 1307def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
f9934b96 1308 hc = http_class(*args, **kwargs)
be4a824d 1309 source_address = ydl_handler._params.get('source_address')
8959018a 1310
be4a824d 1311 if source_address is not None:
8959018a
AU
1312 # This is to workaround _create_connection() from socket where it will try all
1313 # address data from getaddrinfo() including IPv6. This filters the result from
1314 # getaddrinfo() based on the source_address value.
1315 # This is based on the cpython socket.create_connection() function.
1316 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1317 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1318 host, port = address
1319 err = None
1320 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
9e21e6d9
S
1321 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1322 ip_addrs = [addr for addr in addrs if addr[0] == af]
1323 if addrs and not ip_addrs:
1324 ip_version = 'v4' if af == socket.AF_INET else 'v6'
86e5f3ed 1325 raise OSError(
9e21e6d9
S
1326 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1327 % (ip_version, source_address[0]))
8959018a
AU
1328 for res in ip_addrs:
1329 af, socktype, proto, canonname, sa = res
1330 sock = None
1331 try:
1332 sock = socket.socket(af, socktype, proto)
1333 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1334 sock.settimeout(timeout)
1335 sock.bind(source_address)
1336 sock.connect(sa)
1337 err = None # Explicitly break reference cycle
1338 return sock
86e5f3ed 1339 except OSError as _:
8959018a
AU
1340 err = _
1341 if sock is not None:
1342 sock.close()
1343 if err is not None:
1344 raise err
1345 else:
86e5f3ed 1346 raise OSError('getaddrinfo returns an empty list')
9e21e6d9
S
1347 if hasattr(hc, '_create_connection'):
1348 hc._create_connection = _create_connection
cfb0511d 1349 hc.source_address = (source_address, 0)
be4a824d
PH
1350
1351 return hc
1352
1353
87f0e62d 1354def handle_youtubedl_headers(headers):
992fc9d6
YCH
1355 filtered_headers = headers
1356
1357 if 'Youtubedl-no-compression' in filtered_headers:
86e5f3ed 1358 filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
87f0e62d 1359 del filtered_headers['Youtubedl-no-compression']
87f0e62d 1360
992fc9d6 1361 return filtered_headers
87f0e62d
YCH
1362
1363
ac668111 1364class YoutubeDLHandler(urllib.request.HTTPHandler):
59ae15a5
PH
1365 """Handler for HTTP requests and responses.
1366
1367 This class, when installed with an OpenerDirector, automatically adds
1368 the standard headers to every HTTP request and handles gzipped and
1369 deflated responses from web servers. If compression is to be avoided in
1370 a particular request, the original request in the program code only has
0424ec30 1371 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
1372 removed before making the real request.
1373
1374 Part of this code was copied from:
1375
1376 http://techknack.net/python-urllib2-handlers/
1377
1378 Andrew Rowls, the author of that code, agreed to release it to the
1379 public domain.
1380 """
1381
be4a824d 1382 def __init__(self, params, *args, **kwargs):
ac668111 1383 urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
be4a824d
PH
1384 self._params = params
1385
1386 def http_open(self, req):
ac668111 1387 conn_class = http.client.HTTPConnection
71aff188
YCH
1388
1389 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1390 if socks_proxy:
1391 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1392 del req.headers['Ytdl-socks-proxy']
1393
be4a824d 1394 return self.do_open(functools.partial(
71aff188 1395 _create_http_connection, self, conn_class, False),
be4a824d
PH
1396 req)
1397
59ae15a5
PH
1398 @staticmethod
1399 def deflate(data):
fc2119f2 1400 if not data:
1401 return data
59ae15a5
PH
1402 try:
1403 return zlib.decompress(data, -zlib.MAX_WBITS)
1404 except zlib.error:
1405 return zlib.decompress(data)
1406
4390d5ec 1407 @staticmethod
1408 def brotli(data):
1409 if not data:
1410 return data
9b8ee23b 1411 return brotli.decompress(data)
4390d5ec 1412
acebc9cd 1413 def http_request(self, req):
51f267d9
S
1414 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1415 # always respected by websites, some tend to give out URLs with non percent-encoded
1416 # non-ASCII characters (see telemb.py, ard.py [#3412])
1417 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1418 # To work around aforementioned issue we will replace request's original URL with
1419 # percent-encoded one
1420 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1421 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1422 url = req.get_full_url()
1423 url_escaped = escape_url(url)
1424
1425 # Substitute URL if any change after escaping
1426 if url != url_escaped:
15d260eb 1427 req = update_Request(req, url=url_escaped)
51f267d9 1428
8b7539d2 1429 for h, v in self._params.get('http_headers', std_headers).items():
3d5f7a39
JK
1430 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1431 # The dict keys are capitalized because of this bug by urllib
1432 if h.capitalize() not in req.headers:
33ac271b 1433 req.add_header(h, v)
87f0e62d 1434
af14914b 1435 if 'Accept-encoding' not in req.headers:
1436 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1437
87f0e62d 1438 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b 1439
379a4f16 1440 return super().do_request_(req)
59ae15a5 1441
acebc9cd 1442 def http_response(self, req, resp):
59ae15a5
PH
1443 old_resp = resp
1444 # gzip
1445 if resp.headers.get('Content-encoding', '') == 'gzip':
aa3e9507
PH
1446 content = resp.read()
1447 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1448 try:
1449 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1450 except OSError as original_ioerror:
aa3e9507 1451 # There may be junk at the end of the file
1452 # See http://stackoverflow.com/q/4928560/35070 for details
1453 for i in range(1, 1024):
1454 try:
1455 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1456 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1457 except OSError:
aa3e9507
PH
1458 continue
1459 break
1460 else:
1461 raise original_ioerror
ac668111 1462 resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5
PH
1463 resp.msg = old_resp.msg
1464 # deflate
1465 if resp.headers.get('Content-encoding', '') == 'deflate':
1466 gz = io.BytesIO(self.deflate(resp.read()))
ac668111 1467 resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1468 resp.msg = old_resp.msg
4390d5ec 1469 # brotli
1470 if resp.headers.get('Content-encoding', '') == 'br':
ac668111 1471 resp = urllib.request.addinfourl(
4390d5ec 1472 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1473 resp.msg = old_resp.msg
ad729172 1474 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
067aa17e 1475 # https://github.com/ytdl-org/youtube-dl/issues/6457).
5a4d9ddb
S
1476 if 300 <= resp.code < 400:
1477 location = resp.headers.get('Location')
1478 if location:
1479 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
0f06bcd7 1480 location = location.encode('iso-8859-1').decode()
5a4d9ddb
S
1481 location_escaped = escape_url(location)
1482 if location != location_escaped:
1483 del resp.headers['Location']
1484 resp.headers['Location'] = location_escaped
59ae15a5 1485 return resp
0f8d03f8 1486
acebc9cd
PH
1487 https_request = http_request
1488 https_response = http_response
bf50b038 1489
5de90176 1490
71aff188
YCH
1491def make_socks_conn_class(base_class, socks_proxy):
1492 assert issubclass(base_class, (
ac668111 1493 http.client.HTTPConnection, http.client.HTTPSConnection))
71aff188 1494
14f25df2 1495 url_components = urllib.parse.urlparse(socks_proxy)
71aff188
YCH
1496 if url_components.scheme.lower() == 'socks5':
1497 socks_type = ProxyType.SOCKS5
1498 elif url_components.scheme.lower() in ('socks', 'socks4'):
1499 socks_type = ProxyType.SOCKS4
51fb4995
YCH
1500 elif url_components.scheme.lower() == 'socks4a':
1501 socks_type = ProxyType.SOCKS4A
71aff188 1502
cdd94c2e
YCH
1503 def unquote_if_non_empty(s):
1504 if not s:
1505 return s
ac668111 1506 return urllib.parse.unquote_plus(s)
cdd94c2e 1507
71aff188
YCH
1508 proxy_args = (
1509 socks_type,
1510 url_components.hostname, url_components.port or 1080,
1511 True, # Remote DNS
cdd94c2e
YCH
1512 unquote_if_non_empty(url_components.username),
1513 unquote_if_non_empty(url_components.password),
71aff188
YCH
1514 )
1515
1516 class SocksConnection(base_class):
1517 def connect(self):
1518 self.sock = sockssocket()
1519 self.sock.setproxy(*proxy_args)
19a03940 1520 if isinstance(self.timeout, (int, float)):
71aff188
YCH
1521 self.sock.settimeout(self.timeout)
1522 self.sock.connect((self.host, self.port))
1523
ac668111 1524 if isinstance(self, http.client.HTTPSConnection):
71aff188
YCH
1525 if hasattr(self, '_context'): # Python > 2.6
1526 self.sock = self._context.wrap_socket(
1527 self.sock, server_hostname=self.host)
1528 else:
1529 self.sock = ssl.wrap_socket(self.sock)
1530
1531 return SocksConnection
1532
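A minimal usage sketch (editorial illustration, not part of utils.py; the proxy URL, credentials and host below are hypothetical): the returned class behaves like the wrapped connection class, except that connect() tunnels through the given SOCKS proxy.

import http.client

from yt_dlp.utils import make_socks_conn_class

# Wrap the stdlib HTTPSConnection so that connect() goes through a SOCKS5 proxy;
# username/password are percent-decoded by unquote_if_non_empty above.
conn_class = make_socks_conn_class(
    http.client.HTTPSConnection, 'socks5://user:pass@127.0.0.1:1080')
conn = conn_class('example.com', 443)  # no socket is opened until connect()
# conn.request('GET', '/')             # would establish the tunnel and send the request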
1533
ac668111 1534class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
be4a824d 1535 def __init__(self, params, https_conn_class=None, *args, **kwargs):
ac668111 1536 urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1537 self._https_conn_class = https_conn_class or http.client.HTTPSConnection
be4a824d
PH
1538 self._params = params
1539
1540 def https_open(self, req):
4f264c02 1541 kwargs = {}
71aff188
YCH
1542 conn_class = self._https_conn_class
1543
4f264c02
JMF
1544 if hasattr(self, '_context'): # python > 2.6
1545 kwargs['context'] = self._context
1546 if hasattr(self, '_check_hostname'): # python 3.x
1547 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1548
1549 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1550 if socks_proxy:
1551 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1552 del req.headers['Ytdl-socks-proxy']
1553
4f28b537 1554 try:
1555 return self.do_open(
1556 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1557 except urllib.error.URLError as e:
1558 if (isinstance(e.reason, ssl.SSLError)
1559 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1560 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1561 raise
be4a824d
PH
1562
1563
941e881e 1564def is_path_like(f):
1565 return isinstance(f, (str, bytes, os.PathLike))
1566
1567
ac668111 1568class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
f1a8511f
S
1569 """
1570 See [1] for cookie file format.
1571
1572 1. https://curl.haxx.se/docs/http-cookies.html
1573 """
e7e62441 1574 _HTTPONLY_PREFIX = '#HttpOnly_'
c380cc28
S
1575 _ENTRY_LEN = 7
1576 _HEADER = '''# Netscape HTTP Cookie File
7a5c1cfe 1577# This file is generated by yt-dlp. Do not edit.
c380cc28
S
1578
1579'''
1580 _CookieFileEntry = collections.namedtuple(
1581 'CookieFileEntry',
1582 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
e7e62441 1583
d76fa1f3 1584 def __init__(self, filename=None, *args, **kwargs):
1585 super().__init__(None, *args, **kwargs)
941e881e 1586 if is_path_like(filename):
d76fa1f3 1587 filename = os.fspath(filename)
1588 self.filename = filename
1589
24146491 1590 @staticmethod
1591 def _true_or_false(cndn):
1592 return 'TRUE' if cndn else 'FALSE'
1593
d76fa1f3 1594 @contextlib.contextmanager
1595 def open(self, file, *, write=False):
941e881e 1596 if is_path_like(file):
d76fa1f3 1597 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1598 yield f
1599 else:
1600 if write:
1601 file.truncate(0)
1602 yield file
1603
24146491 1604 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1605 now = time.time()
1606 for cookie in self:
1607 if (not ignore_discard and cookie.discard
1608 or not ignore_expires and cookie.is_expired(now)):
1609 continue
1610 name, value = cookie.name, cookie.value
1611 if value is None:
1612 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1613 # with no name, whereas http.cookiejar regards it as a
1614 # cookie with no value.
1615 name, value = '', name
1616 f.write('%s\n' % '\t'.join((
1617 cookie.domain,
1618 self._true_or_false(cookie.domain.startswith('.')),
1619 cookie.path,
1620 self._true_or_false(cookie.secure),
1621 str_or_none(cookie.expires, default=''),
1622 name, value
1623 )))
1624
1625 def save(self, filename=None, *args, **kwargs):
c380cc28
S
1626 """
1627 Save cookies to a file.
24146491 1628 Code is taken from CPython 3.6
1629 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
c380cc28 1630
c380cc28
S
1631 if filename is None:
1632 if self.filename is not None:
1633 filename = self.filename
1634 else:
ac668111 1635 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
c380cc28 1636
24146491 1637 # Store session cookies with `expires` set to 0 instead of an empty string
1bab3437
S
1638 for cookie in self:
1639 if cookie.expires is None:
1640 cookie.expires = 0
c380cc28 1641
d76fa1f3 1642 with self.open(filename, write=True) as f:
c380cc28 1643 f.write(self._HEADER)
24146491 1644 self._really_save(f, *args, **kwargs)
1bab3437
S
1645
1646 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
e7e62441 1647 """Load cookies from a file."""
1648 if filename is None:
1649 if self.filename is not None:
1650 filename = self.filename
1651 else:
ac668111 1652 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
e7e62441 1653
c380cc28
S
1654 def prepare_line(line):
1655 if line.startswith(self._HTTPONLY_PREFIX):
1656 line = line[len(self._HTTPONLY_PREFIX):]
1657 # comments and empty lines are fine
1658 if line.startswith('#') or not line.strip():
1659 return line
1660 cookie_list = line.split('\t')
1661 if len(cookie_list) != self._ENTRY_LEN:
ac668111 1662 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
c380cc28
S
1663 cookie = self._CookieFileEntry(*cookie_list)
1664 if cookie.expires_at and not cookie.expires_at.isdigit():
ac668111 1665 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
c380cc28
S
1666 return line
1667
e7e62441 1668 cf = io.StringIO()
d76fa1f3 1669 with self.open(filename) as f:
e7e62441 1670 for line in f:
c380cc28
S
1671 try:
1672 cf.write(prepare_line(line))
ac668111 1673 except http.cookiejar.LoadError as e:
94aa0644 1674 if f'{line.strip()} '[0] in '[{"':
ac668111 1675 raise http.cookiejar.LoadError(
94aa0644 1676 'Cookies file must be Netscape formatted, not JSON. See '
17ffed18 1677 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
19a03940 1678 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
c380cc28 1679 continue
e7e62441 1680 cf.seek(0)
1681 self._really_load(cf, filename, ignore_discard, ignore_expires)
1bab3437
S
1682 # Session cookies are denoted by either `expires` field set to
1683 # an empty string or 0. MozillaCookieJar only recognizes the former
1684 # (see [1]). So we need to force the latter to be recognized as session
1685 # cookies on our own.
1686 # Session cookies may be important for cookies-based authentication,
1687 # e.g. usually, when the user does not check the 'Remember me' check box while
1688 # logging in on a site, some important cookies are stored as session
1689 # cookies, so failing to recognize them will result in a failed login.
1690 # 1. https://bugs.python.org/issue17164
1691 for cookie in self:
1692 # Treat `expires=0` cookies as session cookies
1693 if cookie.expires == 0:
1694 cookie.expires = None
1695 cookie.discard = True
1696
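A brief usage sketch (editorial illustration, not part of utils.py; 'cookies.txt' is a hypothetical path). Each non-comment line of the file maps onto one tab-separated _CookieFileEntry:

from yt_dlp.utils import YoutubeDLCookieJar

# A valid entry line has 7 tab-separated fields, e.g.:
# .example.com<TAB>TRUE<TAB>/<TAB>TRUE<TAB>1893456000<TAB>session_id<TAB>abc123
jar = YoutubeDLCookieJar('cookies.txt')  # no I/O happens on construction
# jar.load()   # tolerates the '#HttpOnly_' prefix and rejects JSON cookie exports
# jar.save()   # writes _HEADER and stores session cookies with expires=0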
1697
ac668111 1698class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
a6420bf5 1699 def __init__(self, cookiejar=None):
ac668111 1700 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
a6420bf5
S
1701
1702 def http_response(self, request, response):
ac668111 1703 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
a6420bf5 1704
ac668111 1705 https_request = urllib.request.HTTPCookieProcessor.http_request
a6420bf5
S
1706 https_response = http_response
1707
1708
ac668111 1709class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
201c1459 1710 """YoutubeDL redirect handler
1711
1712 The code is based on HTTPRedirectHandler implementation from CPython [1].
1713
1714 This redirect handler solves two issues:
1715 - ensures redirect URL is always unicode under python 2
1716 - introduces support for experimental HTTP response status code
1717 308 Permanent Redirect [2] used by some sites [3]
1718
1719 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1720 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1721 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1722 """
1723
ac668111 1724 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
201c1459 1725
1726 def redirect_request(self, req, fp, code, msg, headers, newurl):
1727 """Return a Request or None in response to a redirect.
1728
1729 This is called by the http_error_30x methods when a
1730 redirection response is received. If a redirection should
1731 take place, return a new Request to allow http_error_30x to
1732 perform the redirect. Otherwise, raise HTTPError if no-one
1733 else should try to handle this url. Return None if you can't
1734 but another Handler might.
1735 """
1736 m = req.get_method()
1737 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1738 or code in (301, 302, 303) and m == "POST")):
14f25df2 1739 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
201c1459 1740 # Strictly (according to RFC 2616), 301 or 302 in response to
1741 # a POST MUST NOT cause a redirection without confirmation
1742 # from the user (of urllib.request, in this case). In practice,
1743 # essentially all clients do redirect in this case, so we do
1744 # the same.
1745
201c1459 1746 # Be conciliant with URIs containing a space. This is mainly
1747 # redundant with the more complete encoding done in http_error_302(),
1748 # but it is kept for compatibility with other callers.
1749 newurl = newurl.replace(' ', '%20')
1750
1751 CONTENT_HEADERS = ("content-length", "content-type")
1752 # Strip the Content-* headers since the redirected request is built without the original body
86e5f3ed 1753 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
afac4caa 1754
1755 # A 303 must either use GET or HEAD for subsequent request
1756 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1757 if code == 303 and m != 'HEAD':
1758 m = 'GET'
1759 # 301 and 302 redirects are commonly turned into a GET from a POST
1760 # for subsequent requests by browsers, so we'll do the same.
1761 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1762 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1763 if code in (301, 302) and m == 'POST':
1764 m = 'GET'
1765
ac668111 1766 return urllib.request.Request(
201c1459 1767 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
afac4caa 1768 unverifiable=True, method=m)
fca6dba8
S
1769
1770
46f59e89
S
1771def extract_timezone(date_str):
1772 m = re.search(
f137e4c2 1773 r'''(?x)
1774 ^.{8,}? # >=8 char non-TZ prefix, if present
1775 (?P<tz>Z| # just the UTC Z, or
1776 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1777 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1778 [ ]? # optional space
1779 (?P<sign>\+|-) # +/-
1780 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1781 $)
1782 ''', date_str)
46f59e89 1783 if not m:
8f53dc44 1784 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1785 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1786 if timezone is not None:
1787 date_str = date_str[:-len(m.group('tz'))]
1788 timezone = datetime.timedelta(hours=timezone or 0)
46f59e89
S
1789 else:
1790 date_str = date_str[:-len(m.group('tz'))]
1791 if not m.group('sign'):
1792 timezone = datetime.timedelta()
1793 else:
1794 sign = 1 if m.group('sign') == '+' else -1
1795 timezone = datetime.timedelta(
1796 hours=sign * int(m.group('hours')),
1797 minutes=sign * int(m.group('minutes')))
1798 return timezone, date_str
1799
1800
08b38d54 1801def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1802 """ Return a UNIX timestamp from the given date """
1803
1804 if date_str is None:
1805 return None
1806
52c3a6e4
S
1807 date_str = re.sub(r'\.[0-9]+', '', date_str)
1808
08b38d54 1809 if timezone is None:
46f59e89
S
1810 timezone, date_str = extract_timezone(date_str)
1811
19a03940 1812 with contextlib.suppress(ValueError):
86e5f3ed 1813 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1814 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1815 return calendar.timegm(dt.timetuple())
912b38b4
PH
1816
1817
46f59e89
S
1818def date_formats(day_first=True):
1819 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1820
1821
42bdd9d0 1822def unified_strdate(date_str, day_first=True):
bf50b038 1823 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1824
1825 if date_str is None:
1826 return None
bf50b038 1827 upload_date = None
5f6a1245 1828 # Replace commas
026fcc04 1829 date_str = date_str.replace(',', ' ')
42bdd9d0 1830 # Remove AM/PM + timezone
9bb8e0a3 1831 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1832 _, date_str = extract_timezone(date_str)
42bdd9d0 1833
46f59e89 1834 for expression in date_formats(day_first):
19a03940 1835 with contextlib.suppress(ValueError):
bf50b038 1836 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1837 if upload_date is None:
1838 timetuple = email.utils.parsedate_tz(date_str)
1839 if timetuple:
19a03940 1840 with contextlib.suppress(ValueError):
c6b9cf05 1841 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402 1842 if upload_date is not None:
14f25df2 1843 return str(upload_date)
bf50b038 1844
5f6a1245 1845
46f59e89
S
1846def unified_timestamp(date_str, day_first=True):
1847 if date_str is None:
1848 return None
1849
8f53dc44 1850 date_str = re.sub(r'\s+', ' ', re.sub(
1851 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
46f59e89 1852
7dc2a74e 1853 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1854 timezone, date_str = extract_timezone(date_str)
1855
1856 # Remove AM/PM + timezone
1857 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1858
deef3195
S
1859 # Remove unrecognized timezones from ISO 8601 alike timestamps
1860 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1861 if m:
1862 date_str = date_str[:-len(m.group('tz'))]
1863
f226880c
PH
1864 # Python only supports microseconds, so remove nanoseconds
1865 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1866 if m:
1867 date_str = m.group(1)
1868
46f59e89 1869 for expression in date_formats(day_first):
19a03940 1870 with contextlib.suppress(ValueError):
7dc2a74e 1871 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1872 return calendar.timegm(dt.timetuple())
8f53dc44 1873
46f59e89
S
1874 timetuple = email.utils.parsedate_tz(date_str)
1875 if timetuple:
8f53dc44 1876 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1877
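For illustration (editorial, not part of utils.py; the exact set of accepted patterns comes from DATE_FORMATS, defined elsewhere in this file): unified_strdate normalizes to 'YYYYMMDD', while unified_timestamp returns a UNIX timestamp.

from yt_dlp.utils import unified_strdate, unified_timestamp

unified_strdate('December 21, 2021')       # '20211221'
unified_strdate('21/12/2021')              # '20211221' (day_first=True by default)
unified_timestamp('2021-12-21T10:30:00Z')  # 1640082600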
1878
28e614de 1879def determine_ext(url, default_ext='unknown_video'):
85750f89 1880 if url is None or '.' not in url:
f4776371 1881 return default_ext
9cb9a5df 1882 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1883 if re.match(r'^[A-Za-z0-9]+$', guess):
1884 return guess
a7aaa398
S
1885 # Try to extract the ext from URLs like http://example.com/foo/bar.mp4/?download
1886 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1887 return guess.rstrip('/')
73e79f2a 1888 else:
cbdbb766 1889 return default_ext
73e79f2a 1890
5f6a1245 1891
824fa511
S
1892def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1893 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1894
5f6a1245 1895
9e62f283 1896def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1897 R"""
1898 Return a datetime object from a string.
1899 Supported format:
1900 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1901
1902 @param format strftime format of DATE
1903 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1904 auto: round to the unit provided in date_str (if applicable).
9e62f283 1905 """
1906 auto_precision = False
1907 if precision == 'auto':
1908 auto_precision = True
1909 precision = 'microsecond'
396a76f7 1910 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1911 if date_str in ('now', 'today'):
37254abc 1912 return today
f8795e10
PH
1913 if date_str == 'yesterday':
1914 return today - datetime.timedelta(days=1)
9e62f283 1915 match = re.match(
3d38b2d6 1916 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1917 date_str)
37254abc 1918 if match is not None:
9e62f283 1919 start_time = datetime_from_str(match.group('start'), precision, format)
1920 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1921 unit = match.group('unit')
9e62f283 1922 if unit == 'month' or unit == 'year':
1923 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1924 unit = 'day'
9e62f283 1925 else:
1926 if unit == 'week':
1927 unit = 'day'
1928 time *= 7
1929 delta = datetime.timedelta(**{unit + 's': time})
1930 new_date = start_time + delta
1931 if auto_precision:
1932 return datetime_round(new_date, unit)
1933 return new_date
1934
1935 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1936
1937
d49f8db3 1938def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1939 R"""
1940 Return a date object from a string using datetime_from_str
9e62f283 1941
3d38b2d6 1942 @param strict Restrict allowed patterns to "YYYYMMDD" and
1943 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1944 """
3d38b2d6 1945 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1946 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1947 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1948
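An editorial illustration (not part of utils.py) of the relative-date syntax documented above:

from yt_dlp.utils import date_from_str, datetime_from_str

date_from_str('today')                  # today's date
date_from_str('now-1week')              # seven days ago (as a date)
date_from_str('19991231', strict=True)  # datetime.date(1999, 12, 31)
datetime_from_str('now+3days', precision='hour')  # three days ahead, rounded to the hour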
1949
1950def datetime_add_months(dt, months):
1951 """Increment/Decrement a datetime object by months."""
1952 month = dt.month + months - 1
1953 year = dt.year + month // 12
1954 month = month % 12 + 1
1955 day = min(dt.day, calendar.monthrange(year, month)[1])
1956 return dt.replace(year, month, day)
1957
1958
1959def datetime_round(dt, precision='day'):
1960 """
1961 Round a datetime object's time to a specific precision
1962 """
1963 if precision == 'microsecond':
1964 return dt
1965
1966 unit_seconds = {
1967 'day': 86400,
1968 'hour': 3600,
1969 'minute': 60,
1970 'second': 1,
1971 }
1972 roundto = lambda x, n: ((x + n / 2) // n) * n
1973 timestamp = calendar.timegm(dt.timetuple())
1974 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1975
1976
e63fc1be 1977def hyphenate_date(date_str):
1978 """
1979 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1980 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1981 if match is not None:
1982 return '-'.join(match.groups())
1983 else:
1984 return date_str
1985
5f6a1245 1986
86e5f3ed 1987class DateRange:
bd558525 1988 """Represents a time interval between two dates"""
5f6a1245 1989
bd558525
JMF
1990 def __init__(self, start=None, end=None):
1991 """start and end must be strings in the format accepted by date"""
1992 if start is not None:
d49f8db3 1993 self.start = date_from_str(start, strict=True)
bd558525
JMF
1994 else:
1995 self.start = datetime.datetime.min.date()
1996 if end is not None:
d49f8db3 1997 self.end = date_from_str(end, strict=True)
bd558525
JMF
1998 else:
1999 self.end = datetime.datetime.max.date()
37254abc 2000 if self.start > self.end:
bd558525 2001 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
5f6a1245 2002
bd558525
JMF
2003 @classmethod
2004 def day(cls, day):
2005 """Returns a range that only contains the given day"""
5f6a1245
JW
2006 return cls(day, day)
2007
bd558525
JMF
2008 def __contains__(self, date):
2009 """Check if the date is in the range"""
37254abc
JMF
2010 if not isinstance(date, datetime.date):
2011 date = date_from_str(date)
2012 return self.start <= date <= self.end
5f6a1245 2013
bd558525 2014 def __str__(self):
86e5f3ed 2015 return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96 2016
f2df4071 2017 def __eq__(self, other):
2018 return (isinstance(other, DateRange)
2019 and self.start == other.start and self.end == other.end)
2020
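An editorial usage sketch (not part of utils.py); DateRange backs date filtering such as the --dateafter/--datebefore options:

from yt_dlp.utils import DateRange

releases_2023 = DateRange('20230101', '20231231')
'20230704' in releases_2023   # True  -- strings are parsed via date_from_str
'20240101' in releases_2023   # False
DateRange.day('20230704')     # a range containing only that day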
c496ca96
PH
2021
2022def platform_name():
14f25df2 2023 """ Returns the platform name as a str """
da4db748 2024 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
b1f94422 2025 return platform.platform()
c496ca96 2026
b1f94422 2027
2028@functools.cache
2029def system_identifier():
2030 python_implementation = platform.python_implementation()
2031 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2032 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
dab284f8 2033 libc_ver = []
2034 with contextlib.suppress(OSError): # We may not have access to the executable
2035 libc_ver = platform.libc_ver()
b1f94422 2036
17fc3dc4 2037 return 'Python %s (%s %s %s) - %s (%s%s)' % (
b1f94422 2038 platform.python_version(),
2039 python_implementation,
17fc3dc4 2040 platform.machine(),
b1f94422 2041 platform.architecture()[0],
2042 platform.platform(),
5b9f253f
M
2043 ssl.OPENSSL_VERSION,
2044 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
b1f94422 2045 )
c257baff
PH
2046
2047
0b9c08b4 2048@functools.cache
49fa4d9a 2049def get_windows_version():
8a82af35 2050 ''' Get Windows version. Returns () if not running on Windows '''
49fa4d9a
N
2051 if compat_os_name == 'nt':
2052 return version_tuple(platform.win32_ver()[1])
2053 else:
8a82af35 2054 return ()
49fa4d9a
N
2055
2056
734f90bb 2057def write_string(s, out=None, encoding=None):
19a03940 2058 assert isinstance(s, str)
2059 out = out or sys.stderr
3b479100
SS
2060 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
2061 if not out:
2062 return
7459e3a2 2063
fe1daad3 2064 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 2065 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 2066
8a82af35 2067 enc, buffer = None, out
cfb0511d 2068 if 'b' in getattr(out, 'mode', ''):
c487cf00 2069 enc = encoding or preferredencoding()
104aa738 2070 elif hasattr(out, 'buffer'):
8a82af35 2071 buffer = out.buffer
104aa738 2072 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 2073
8a82af35 2074 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
2075 out.flush()
2076
2077
da4db748 2078def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2079 from . import _IN_CLI
2080 if _IN_CLI:
2081 if msg in deprecation_warning._cache:
2082 return
2083 deprecation_warning._cache.add(msg)
2084 if printer:
2085 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2086 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2087 else:
2088 import warnings
2089 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2090
2091
2092deprecation_warning._cache = set()
2093
2094
48ea9cea
PH
2095def bytes_to_intlist(bs):
2096 if not bs:
2097 return []
2098 if isinstance(bs[0], int): # Python 3
2099 return list(bs)
2100 else:
2101 return [ord(c) for c in bs]
2102
c257baff 2103
cba892fa 2104def intlist_to_bytes(xs):
2105 if not xs:
2106 return b''
ac668111 2107 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
2108
2109
8a82af35 2110class LockingUnsupportedError(OSError):
1890fc63 2111 msg = 'File locking is not supported'
0edb3e33 2112
2113 def __init__(self):
2114 super().__init__(self.msg)
2115
2116
c1c9a79c
PH
2117# Cross-platform file locking
2118if sys.platform == 'win32':
fe0918bb 2119 import ctypes
c1c9a79c
PH
2120 import ctypes.wintypes
2121 import msvcrt
2122
2123 class OVERLAPPED(ctypes.Structure):
2124 _fields_ = [
2125 ('Internal', ctypes.wintypes.LPVOID),
2126 ('InternalHigh', ctypes.wintypes.LPVOID),
2127 ('Offset', ctypes.wintypes.DWORD),
2128 ('OffsetHigh', ctypes.wintypes.DWORD),
2129 ('hEvent', ctypes.wintypes.HANDLE),
2130 ]
2131
37e325b9 2132 kernel32 = ctypes.WinDLL('kernel32')
c1c9a79c
PH
2133 LockFileEx = kernel32.LockFileEx
2134 LockFileEx.argtypes = [
2135 ctypes.wintypes.HANDLE, # hFile
2136 ctypes.wintypes.DWORD, # dwFlags
2137 ctypes.wintypes.DWORD, # dwReserved
2138 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2139 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2140 ctypes.POINTER(OVERLAPPED) # Overlapped
2141 ]
2142 LockFileEx.restype = ctypes.wintypes.BOOL
2143 UnlockFileEx = kernel32.UnlockFileEx
2144 UnlockFileEx.argtypes = [
2145 ctypes.wintypes.HANDLE, # hFile
2146 ctypes.wintypes.DWORD, # dwReserved
2147 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2148 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2149 ctypes.POINTER(OVERLAPPED) # Overlapped
2150 ]
2151 UnlockFileEx.restype = ctypes.wintypes.BOOL
2152 whole_low = 0xffffffff
2153 whole_high = 0x7fffffff
2154
747c0bd1 2155 def _lock_file(f, exclusive, block):
c1c9a79c
PH
2156 overlapped = OVERLAPPED()
2157 overlapped.Offset = 0
2158 overlapped.OffsetHigh = 0
2159 overlapped.hEvent = 0
2160 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 2161
2162 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2163 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2164 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 2165 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2166 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
2167
2168 def _unlock_file(f):
2169 assert f._lock_file_overlapped_p
2170 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 2171 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
2172 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2173
2174else:
399a76e6
YCH
2175 try:
2176 import fcntl
c1c9a79c 2177
a3125791 2178 def _lock_file(f, exclusive, block):
b63837bc 2179 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2180 if not block:
2181 flags |= fcntl.LOCK_NB
acea8d7c 2182 try:
b63837bc 2183 fcntl.flock(f, flags)
acea8d7c
JK
2184 except BlockingIOError:
2185 raise
2186 except OSError: # AOSP does not have flock()
b63837bc 2187 fcntl.lockf(f, flags)
c1c9a79c 2188
399a76e6 2189 def _unlock_file(f):
acea8d7c
JK
2190 try:
2191 fcntl.flock(f, fcntl.LOCK_UN)
2192 except OSError:
2193 fcntl.lockf(f, fcntl.LOCK_UN)
a3125791 2194
399a76e6 2195 except ImportError:
399a76e6 2196
a3125791 2197 def _lock_file(f, exclusive, block):
0edb3e33 2198 raise LockingUnsupportedError()
399a76e6
YCH
2199
2200 def _unlock_file(f):
0edb3e33 2201 raise LockingUnsupportedError()
c1c9a79c
PH
2202
2203
86e5f3ed 2204class locked_file:
0edb3e33 2205 locked = False
747c0bd1 2206
a3125791 2207 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2208 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2209 raise NotImplementedError(mode)
2210 self.mode, self.block = mode, block
2211
2212 writable = any(f in mode for f in 'wax+')
2213 readable = any(f in mode for f in 'r+')
2214 flags = functools.reduce(operator.ior, (
2215 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2216 getattr(os, 'O_BINARY', 0), # Windows only
2217 getattr(os, 'O_NOINHERIT', 0), # Windows only
2218 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2219 os.O_APPEND if 'a' in mode else 0,
2220 os.O_EXCL if 'x' in mode else 0,
2221 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2222 ))
2223
98804d03 2224 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2225
2226 def __enter__(self):
a3125791 2227 exclusive = 'r' not in self.mode
c1c9a79c 2228 try:
a3125791 2229 _lock_file(self.f, exclusive, self.block)
0edb3e33 2230 self.locked = True
86e5f3ed 2231 except OSError:
c1c9a79c
PH
2232 self.f.close()
2233 raise
fcfa8853 2234 if 'w' in self.mode:
131e14dc
JK
2235 try:
2236 self.f.truncate()
2237 except OSError as e:
1890fc63 2238 if e.errno not in (
2239 errno.ESPIPE, # Illegal seek - expected for FIFO
2240 errno.EINVAL, # Invalid argument - expected for /dev/null
2241 ):
2242 raise
c1c9a79c
PH
2243 return self
2244
0edb3e33 2245 def unlock(self):
2246 if not self.locked:
2247 return
c1c9a79c 2248 try:
0edb3e33 2249 _unlock_file(self.f)
c1c9a79c 2250 finally:
0edb3e33 2251 self.locked = False
c1c9a79c 2252
0edb3e33 2253 def __exit__(self, *_):
2254 try:
2255 self.unlock()
2256 finally:
2257 self.f.close()
4eb7f1d1 2258
0edb3e33 2259 open = __enter__
2260 close = __exit__
a3125791 2261
0edb3e33 2262 def __getattr__(self, attr):
2263 return getattr(self.f, attr)
a3125791 2264
0edb3e33 2265 def __iter__(self):
2266 return iter(self.f)
a3125791 2267
4eb7f1d1 2268
0b9c08b4 2269@functools.cache
4644ac55
S
2270def get_filesystem_encoding():
2271 encoding = sys.getfilesystemencoding()
2272 return encoding if encoding is not None else 'utf-8'
2273
2274
4eb7f1d1 2275def shell_quote(args):
a6a173c2 2276 quoted_args = []
4644ac55 2277 encoding = get_filesystem_encoding()
a6a173c2
JMF
2278 for a in args:
2279 if isinstance(a, bytes):
2280 # We may get a filename encoded with 'encodeFilename'
2281 a = a.decode(encoding)
aefce8e6 2282 quoted_args.append(compat_shlex_quote(a))
28e614de 2283 return ' '.join(quoted_args)
9d4660ca
PH
2284
2285
2286def smuggle_url(url, data):
2287 """ Pass additional data in a URL for internal use. """
2288
81953d1a
RA
2289 url, idata = unsmuggle_url(url, {})
2290 data.update(idata)
14f25df2 2291 sdata = urllib.parse.urlencode(
28e614de
PH
2292 {'__youtubedl_smuggle': json.dumps(data)})
2293 return url + '#' + sdata
9d4660ca
PH
2294
2295
79f82953 2296def unsmuggle_url(smug_url, default=None):
83e865a3 2297 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2298 return smug_url, default
28e614de 2299 url, _, sdata = smug_url.rpartition('#')
14f25df2 2300 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2301 data = json.loads(jsond)
2302 return url, data
02dbf93f
PH
2303
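For illustration (editorial, not part of utils.py), data smuggled into the URL fragment survives a round trip:

from yt_dlp.utils import smuggle_url, unsmuggle_url

url = smuggle_url('https://example.com/video', {'referer': 'https://example.org'})
# -> 'https://example.com/video#__youtubedl_smuggle=...'
clean_url, data = unsmuggle_url(url)
assert clean_url == 'https://example.com/video'
assert data == {'referer': 'https://example.org'}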
2304
e0fd9573 2305def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2306 """ Formats numbers with decimal sufixes like K, M, etc """
2307 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2308 if num is None or num < 0:
e0fd9573 2309 return None
eeb2a770 2310 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2311 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2312 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2313 if factor == 1024:
2314 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2315 converted = num / (factor ** exponent)
abbeeebc 2316 return fmt % (converted, suffix)
e0fd9573 2317
2318
02dbf93f 2319def format_bytes(bytes):
f02d24d8 2320 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2321
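An editorial illustration (not part of utils.py) of the two factors in use:

from yt_dlp.utils import format_bytes, format_decimal_suffix

format_decimal_suffix(1_500_000)            # '1M'   (decimal factor, '%d' truncates)
format_decimal_suffix(1_500_000, '%.1f%s')  # '1.5M'
format_bytes(1536)                          # '1.50KiB' (binary factor, 'i' suffix)
format_bytes(None)                          # 'N/A'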
1c088fa8 2322
64c464a1 2323def lookup_unit_table(unit_table, s, strict=False):
2324 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
fb47597b 2325 units_re = '|'.join(re.escape(u) for u in unit_table)
64c464a1 2326 m = (re.fullmatch if strict else re.match)(
2327 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
fb47597b
S
2328 if not m:
2329 return None
64c464a1 2330
2331 num = float(m.group('num').replace(',', '.'))
fb47597b 2332 mult = unit_table[m.group('unit')]
64c464a1 2333 return round(num * mult)
2334
2335
2336def parse_bytes(s):
2337 """Parse a string indicating a byte quantity into an integer"""
2338 return lookup_unit_table(
2339 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2340 s.upper(), strict=True)
fb47597b
S
2341
2342
be64b5b0
PH
2343def parse_filesize(s):
2344 if s is None:
2345 return None
2346
dfb1b146 2347 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2348 # but we support those too
2349 _UNIT_TABLE = {
2350 'B': 1,
2351 'b': 1,
70852b47 2352 'bytes': 1,
be64b5b0
PH
2353 'KiB': 1024,
2354 'KB': 1000,
2355 'kB': 1024,
2356 'Kb': 1000,
13585d76 2357 'kb': 1000,
70852b47
YCH
2358 'kilobytes': 1000,
2359 'kibibytes': 1024,
be64b5b0
PH
2360 'MiB': 1024 ** 2,
2361 'MB': 1000 ** 2,
2362 'mB': 1024 ** 2,
2363 'Mb': 1000 ** 2,
13585d76 2364 'mb': 1000 ** 2,
70852b47
YCH
2365 'megabytes': 1000 ** 2,
2366 'mebibytes': 1024 ** 2,
be64b5b0
PH
2367 'GiB': 1024 ** 3,
2368 'GB': 1000 ** 3,
2369 'gB': 1024 ** 3,
2370 'Gb': 1000 ** 3,
13585d76 2371 'gb': 1000 ** 3,
70852b47
YCH
2372 'gigabytes': 1000 ** 3,
2373 'gibibytes': 1024 ** 3,
be64b5b0
PH
2374 'TiB': 1024 ** 4,
2375 'TB': 1000 ** 4,
2376 'tB': 1024 ** 4,
2377 'Tb': 1000 ** 4,
13585d76 2378 'tb': 1000 ** 4,
70852b47
YCH
2379 'terabytes': 1000 ** 4,
2380 'tebibytes': 1024 ** 4,
be64b5b0
PH
2381 'PiB': 1024 ** 5,
2382 'PB': 1000 ** 5,
2383 'pB': 1024 ** 5,
2384 'Pb': 1000 ** 5,
13585d76 2385 'pb': 1000 ** 5,
70852b47
YCH
2386 'petabytes': 1000 ** 5,
2387 'pebibytes': 1024 ** 5,
be64b5b0
PH
2388 'EiB': 1024 ** 6,
2389 'EB': 1000 ** 6,
2390 'eB': 1024 ** 6,
2391 'Eb': 1000 ** 6,
13585d76 2392 'eb': 1000 ** 6,
70852b47
YCH
2393 'exabytes': 1000 ** 6,
2394 'exbibytes': 1024 ** 6,
be64b5b0
PH
2395 'ZiB': 1024 ** 7,
2396 'ZB': 1000 ** 7,
2397 'zB': 1024 ** 7,
2398 'Zb': 1000 ** 7,
13585d76 2399 'zb': 1000 ** 7,
70852b47
YCH
2400 'zettabytes': 1000 ** 7,
2401 'zebibytes': 1024 ** 7,
be64b5b0
PH
2402 'YiB': 1024 ** 8,
2403 'YB': 1000 ** 8,
2404 'yB': 1024 ** 8,
2405 'Yb': 1000 ** 8,
13585d76 2406 'yb': 1000 ** 8,
70852b47
YCH
2407 'yottabytes': 1000 ** 8,
2408 'yobibytes': 1024 ** 8,
be64b5b0
PH
2409 }
2410
fb47597b
S
2411 return lookup_unit_table(_UNIT_TABLE, s)
2412
2413
2414def parse_count(s):
2415 if s is None:
be64b5b0
PH
2416 return None
2417
352d5da8 2418 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2419
2420 if re.match(r'^[\d,.]+$', s):
2421 return str_to_int(s)
2422
2423 _UNIT_TABLE = {
2424 'k': 1000,
2425 'K': 1000,
2426 'm': 1000 ** 2,
2427 'M': 1000 ** 2,
2428 'kk': 1000 ** 2,
2429 'KK': 1000 ** 2,
352d5da8 2430 'b': 1000 ** 3,
2431 'B': 1000 ** 3,
fb47597b 2432 }
be64b5b0 2433
352d5da8 2434 ret = lookup_unit_table(_UNIT_TABLE, s)
2435 if ret is not None:
2436 return ret
2437
2438 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2439 if mobj:
2440 return str_to_int(mobj.group(1))
be64b5b0 2441
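An editorial illustration (not part of utils.py) of the three parsers above:

from yt_dlp.utils import parse_bytes, parse_count, parse_filesize

parse_filesize('1.5 GiB')    # 1610612736  (binary unit)
parse_filesize('1.5 GB')     # 1500000000  (decimal unit)
parse_bytes('10M')           # 10485760    (single-letter units are powers of 1024)
parse_count('1.2M')          # 1200000
parse_count('15,743 views')  # 15743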
2f7ae819 2442
5d45484c 2443def parse_resolution(s, *, lenient=False):
b871d7e9
S
2444 if s is None:
2445 return {}
2446
5d45484c
LNO
2447 if lenient:
2448 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2449 else:
2450 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2451 if mobj:
2452 return {
2453 'width': int(mobj.group('w')),
2454 'height': int(mobj.group('h')),
2455 }
2456
17ec8bcf 2457 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2458 if mobj:
2459 return {'height': int(mobj.group(1))}
2460
2461 mobj = re.search(r'\b([48])[kK]\b', s)
2462 if mobj:
2463 return {'height': int(mobj.group(1)) * 540}
2464
2465 return {}
2466
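Editorial illustration (not part of utils.py):

from yt_dlp.utils import parse_resolution

parse_resolution('1920x1080')  # {'width': 1920, 'height': 1080}
parse_resolution('720p')       # {'height': 720}
parse_resolution('4k')         # {'height': 2160}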
2467
0dc41787 2468def parse_bitrate(s):
14f25df2 2469 if not isinstance(s, str):
0dc41787
S
2470 return
2471 mobj = re.search(r'\b(\d+)\s*kbps', s)
2472 if mobj:
2473 return int(mobj.group(1))
2474
2475
a942d6cb 2476def month_by_name(name, lang='en'):
caefb1de
PH
2477 """ Return the number of a month by (locale-independently) English name """
2478
f6717dec 2479 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2480
caefb1de 2481 try:
f6717dec 2482 return month_names.index(name) + 1
7105440c
YCH
2483 except ValueError:
2484 return None
2485
2486
2487def month_by_abbreviation(abbrev):
2488 """ Return the number of a month by (locale-independently) English
2489 abbreviations """
2490
2491 try:
2492 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2493 except ValueError:
2494 return None
18258362
JMF
2495
2496
5aafe895 2497def fix_xml_ampersands(xml_str):
18258362 2498 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2499 return re.sub(
2500 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2501 '&amp;',
5aafe895 2502 xml_str)
e3946f98
PH
2503
2504
2505def setproctitle(title):
14f25df2 2506 assert isinstance(title, str)
c1c05c67 2507
fe0918bb 2508 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2509 try:
2510 import ctypes
2511 except ImportError:
c1c05c67
YCH
2512 return
2513
e3946f98 2514 try:
611c1dd9 2515 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2516 except OSError:
2517 return
2f49bcd6
RC
2518 except TypeError:
2519 # LoadLibrary in Windows Python 2.7.13 only expects
2520 # a bytestring, but since unicode_literals turns
2521 # every string into a unicode string, it fails.
2522 return
0f06bcd7 2523 title_bytes = title.encode()
6eefe533
PH
2524 buf = ctypes.create_string_buffer(len(title_bytes))
2525 buf.value = title_bytes
e3946f98 2526 try:
6eefe533 2527 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2528 except AttributeError:
2529 return # Strange libc, just skip this
d7dda168
PH
2530
2531
2532def remove_start(s, start):
46bc9b7d 2533 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2534
2535
2b9faf55 2536def remove_end(s, end):
46bc9b7d 2537 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2538
2539
31b2051e
S
2540def remove_quotes(s):
2541 if s is None or len(s) < 2:
2542 return s
2543 for quote in ('"', "'", ):
2544 if s[0] == quote and s[-1] == quote:
2545 return s[1:-1]
2546 return s
2547
2548
b6e0c7d2 2549def get_domain(url):
ebf99aaf 2550 """
2551 This implementation is inconsistent, but is kept for compatibility.
2552 Use this only for "webpage_url_domain"
2553 """
2554 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
2555
2556
29eb5174 2557def url_basename(url):
14f25df2 2558 path = urllib.parse.urlparse(url).path
28e614de 2559 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2560
2561
02dc0a36 2562def base_url(url):
7657ec7e 2563 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
2564
2565
e34c3361 2566def urljoin(base, path):
4b5de77b 2567 if isinstance(path, bytes):
0f06bcd7 2568 path = path.decode()
14f25df2 2569 if not isinstance(path, str) or not path:
e34c3361 2570 return None
fad4ceb5 2571 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2572 return path
4b5de77b 2573 if isinstance(base, bytes):
0f06bcd7 2574 base = base.decode()
14f25df2 2575 if not isinstance(base, str) or not re.match(
4b5de77b 2576 r'^(?:https?:)?//', base):
e34c3361 2577 return None
14f25df2 2578 return urllib.parse.urljoin(base, path)
e34c3361
S
2579
2580
ac668111 2581class HEADRequest(urllib.request.Request):
aa94a6d3 2582 def get_method(self):
611c1dd9 2583 return 'HEAD'
7217e148
PH
2584
2585
ac668111 2586class PUTRequest(urllib.request.Request):
95cf60e8
S
2587 def get_method(self):
2588 return 'PUT'
2589
2590
9732d77e 2591def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2592 if get_attr and v is not None:
2593 v = getattr(v, get_attr, None)
1812afb7
S
2594 try:
2595 return int(v) * invscale // scale
31c49255 2596 except (ValueError, TypeError, OverflowError):
af98f8ff 2597 return default
9732d77e 2598
9572013d 2599
40a90862 2600def str_or_none(v, default=None):
14f25df2 2601 return default if v is None else str(v)
40a90862 2602
9732d77e
PH
2603
2604def str_to_int(int_str):
48d4681e 2605 """ A more relaxed version of int_or_none """
f9934b96 2606 if isinstance(int_str, int):
348c6bf1 2607 return int_str
14f25df2 2608 elif isinstance(int_str, str):
42db58ec
S
2609 int_str = re.sub(r'[,\.\+]', '', int_str)
2610 return int_or_none(int_str)
608d11f5
PH
2611
2612
9732d77e 2613def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2614 if v is None:
2615 return default
2616 try:
2617 return float(v) * invscale / scale
5e1271c5 2618 except (ValueError, TypeError):
caf80631 2619 return default
43f775e4
PH
2620
2621
c7e327c4
S
2622def bool_or_none(v, default=None):
2623 return v if isinstance(v, bool) else default
2624
2625
53cd37ba 2626def strip_or_none(v, default=None):
14f25df2 2627 return v.strip() if isinstance(v, str) else default
b72b4431
S
2628
2629
af03000a 2630def url_or_none(url):
14f25df2 2631 if not url or not isinstance(url, str):
af03000a
S
2632 return None
2633 url = url.strip()
29f7c58a 2634 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2635
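An editorial illustration (not part of utils.py) of the tolerant coercion helpers above; they return a default instead of raising on bad input:

from yt_dlp.utils import float_or_none, int_or_none, str_to_int, url_or_none

int_or_none('42')                       # 42
int_or_none('n/a', default=0)           # 0
int_or_none(3000, scale=1000)           # 3
float_or_none('2.5', invscale=1000)     # 2500.0
str_to_int('1,234,567')                 # 1234567
url_or_none('//cdn.example.com/v.mp4')  # '//cdn.example.com/v.mp4'
url_or_none('not a url')                # None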
2636
3e9b66d7 2637def request_to_url(req):
ac668111 2638 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2639 return req.get_full_url()
2640 else:
2641 return req
2642
2643
e29663c6 2644def strftime_or_none(timestamp, date_format, default=None):
2645 datetime_object = None
2646 try:
f9934b96 2647 if isinstance(timestamp, (int, float)): # unix timestamp
d509c1f5 2648 # Using naive datetime here can break timestamp() in Windows
2649 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2650 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
14f25df2 2651 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 2652 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 2653 date_format = re.sub( # Support %s on windows
2654 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 2655 return datetime_object.strftime(date_format)
2656 except (ValueError, TypeError, AttributeError):
2657 return default
2658
2659
608d11f5 2660def parse_duration(s):
f9934b96 2661 if not isinstance(s, str):
608d11f5 2662 return None
ca7b3246 2663 s = s.strip()
38d79fd1 2664 if not s:
2665 return None
ca7b3246 2666
acaff495 2667 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2668 m = re.match(r'''(?x)
2669 (?P<before_secs>
2670 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2671 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2672 (?P<ms>[.:][0-9]+)?Z?$
2673 ''', s)
acaff495 2674 if m:
8bd1c00b 2675 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2676 else:
2677 m = re.match(
056653bb
S
2678 r'''(?ix)(?:P?
2679 (?:
1c1b2f96 2680 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2681 )?
2682 (?:
1c1b2f96 2683 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2684 )?
2685 (?:
1c1b2f96 2686 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2687 )?
8f4b58d7 2688 (?:
1c1b2f96 2689 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2690 )?
056653bb 2691 T)?
acaff495 2692 (?:
1c1b2f96 2693 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2694 )?
2695 (?:
1c1b2f96 2696 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2697 )?
2698 (?:
2699 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2700 )?Z?$''', s)
acaff495 2701 if m:
2702 days, hours, mins, secs, ms = m.groups()
2703 else:
15846398 2704 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2705 if m:
2706 hours, mins = m.groups()
2707 else:
2708 return None
2709
acaff495 2710 if ms:
19a03940 2711 ms = ms.replace(':', '.')
2712 return sum(float(part or 0) * mult for part, mult in (
2713 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2714
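An editorial illustration (not part of utils.py); parse_duration returns seconds (as a float) from clock-style, human-readable and ISO 8601-like strings:

from yt_dlp.utils import parse_duration

parse_duration('1:02:03')  # 3723.0
parse_duration('2h 30m')   # 9000.0
parse_duration('3 min')    # 180.0
parse_duration('PT1M30S')  # 90.0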
2715
e65e4c88 2716def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2717 name, real_ext = os.path.splitext(filename)
e65e4c88 2718 return (
86e5f3ed 2719 f'{name}.{ext}{real_ext}'
e65e4c88 2720 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2721 else f'{filename}.{ext}')
d70ad093
PH
2722
2723
b3ed15b7
S
2724def replace_extension(filename, ext, expected_real_ext=None):
2725 name, real_ext = os.path.splitext(filename)
86e5f3ed 2726 return '{}.{}'.format(
b3ed15b7
S
2727 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2728 ext)
2729
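Editorial illustration (not part of utils.py):

from yt_dlp.utils import prepend_extension, replace_extension

prepend_extension('video.mp4', 'temp')                               # 'video.temp.mp4'
prepend_extension('video.unknown', 'temp', expected_real_ext='mp4')  # 'video.unknown.temp'
replace_extension('video.mp4', 'mkv')                                # 'video.mkv'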
2730
d70ad093
PH
2731def check_executable(exe, args=[]):
2732 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2733 args can be a list of arguments for a short output (like -version) """
2734 try:
f0c9fb96 2735 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2736 except OSError:
2737 return False
2738 return exe
b7ab0590
PH
2739
2740
7aaf4cd2 2741def _get_exe_version_output(exe, args):
95807118 2742 try:
b64d04c1 2743 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2744 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2745 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
1cdda329 2746 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2747 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2748 if ret:
2749 return None
95807118
PH
2750 except OSError:
2751 return False
f0c9fb96 2752 return stdout
cae97f65
PH
2753
2754
2755def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2756 assert isinstance(output, str)
cae97f65
PH
2757 if version_re is None:
2758 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2759 m = re.search(version_re, output)
95807118
PH
2760 if m:
2761 return m.group(1)
2762 else:
2763 return unrecognized
2764
2765
9af98e17 2766def get_exe_version(exe, args=['--version'],
1cdda329 2767 version_re=None, unrecognized=('present', 'broken')):
9af98e17 2768 """ Returns the version of the specified executable,
2769 or False if the executable is not present """
1cdda329 2770 unrecognized = variadic(unrecognized)
2771 assert len(unrecognized) in (1, 2)
9af98e17 2772 out = _get_exe_version_output(exe, args)
1cdda329 2773 if out is None:
2774 return unrecognized[-1]
2775 return out and detect_exe_version(out, version_re, unrecognized[0])
9af98e17 2776
2777
7e88d7d7 2778def frange(start=0, stop=None, step=1):
2779 """Float range"""
2780 if stop is None:
2781 start, stop = 0, start
2782 sign = [-1, 1][step > 0] if step else 0
2783 while sign * start < sign * stop:
2784 yield start
2785 start += step
2786
2787
cb89cfc1 2788class LazyList(collections.abc.Sequence):
0f06bcd7 2789 """Lazy immutable list from an iterable
2790 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2791
8e5fecc8 2792 class IndexError(IndexError):
2793 pass
2794
282f5709 2795 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2796 self._iterable = iter(iterable)
2797 self._cache = [] if _cache is None else _cache
2798 self._reversed = reverse
483336e7 2799
2800 def __iter__(self):
0f06bcd7 2801 if self._reversed:
28419ca2 2802 # We need to consume the entire iterable to iterate in reverse
981052c9 2803 yield from self.exhaust()
28419ca2 2804 return
0f06bcd7 2805 yield from self._cache
2806 for item in self._iterable:
2807 self._cache.append(item)
483336e7 2808 yield item
2809
0f06bcd7 2810 def _exhaust(self):
2811 self._cache.extend(self._iterable)
2812 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2813 return self._cache
28419ca2 2814
981052c9 2815 def exhaust(self):
0f06bcd7 2816 """Evaluate the entire iterable"""
2817 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2818
28419ca2 2819 @staticmethod
0f06bcd7 2820 def _reverse_index(x):
f2df4071 2821 return None if x is None else ~x
483336e7 2822
2823 def __getitem__(self, idx):
2824 if isinstance(idx, slice):
0f06bcd7 2825 if self._reversed:
2826 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2827 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2828 elif isinstance(idx, int):
0f06bcd7 2829 if self._reversed:
2830 idx = self._reverse_index(idx)
e0f2b4b4 2831 start, stop, step = idx, idx, 0
483336e7 2832 else:
2833 raise TypeError('indices must be integers or slices')
e0f2b4b4 2834 if ((start or 0) < 0 or (stop or 0) < 0
2835 or (start is None and step < 0)
2836 or (stop is None and step > 0)):
483336e7 2837 # We need to consume the entire iterable to be able to slice from the end
2838 # Obviously, never use this with infinite iterables
0f06bcd7 2839 self._exhaust()
8e5fecc8 2840 try:
0f06bcd7 2841 return self._cache[idx]
8e5fecc8 2842 except IndexError as e:
2843 raise self.IndexError(e) from e
0f06bcd7 2844 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2845 if n > 0:
0f06bcd7 2846 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2847 try:
0f06bcd7 2848 return self._cache[idx]
8e5fecc8 2849 except IndexError as e:
2850 raise self.IndexError(e) from e
483336e7 2851
2852 def __bool__(self):
2853 try:
0f06bcd7 2854 self[-1] if self._reversed else self[0]
8e5fecc8 2855 except self.IndexError:
483336e7 2856 return False
2857 return True
2858
2859 def __len__(self):
0f06bcd7 2860 self._exhaust()
2861 return len(self._cache)
483336e7 2862
282f5709 2863 def __reversed__(self):
0f06bcd7 2864 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2865
2866 def __copy__(self):
0f06bcd7 2867 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2868
28419ca2 2869 def __repr__(self):
2870 # repr and str should mimic a list. So we exhaust the iterable
2871 return repr(self.exhaust())
2872
2873 def __str__(self):
2874 return repr(self.exhaust())
2875
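An editorial illustration (not part of utils.py); items are pulled from the underlying iterable only as far as needed and cached:

from yt_dlp.utils import LazyList

squares = LazyList(x * x for x in range(10))
squares[2]    # 4  -- only the first three items have been computed so far
squares[:4]   # [0, 1, 4, 9]  -- slices are plain lists
squares[-1]   # 81 -- a negative index exhausts the iterable
len(squares)  # 10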
483336e7 2876
7be9ccff 2877class PagedList:
c07a39ae 2878
2879 class IndexError(IndexError):
2880 pass
2881
dd26ced1
PH
2882 def __len__(self):
2883 # This is only useful for tests
2884 return len(self.getslice())
2885
7be9ccff 2886 def __init__(self, pagefunc, pagesize, use_cache=True):
2887 self._pagefunc = pagefunc
2888 self._pagesize = pagesize
f1d13090 2889 self._pagecount = float('inf')
7be9ccff 2890 self._use_cache = use_cache
2891 self._cache = {}
2892
2893 def getpage(self, pagenum):
d8cf8d97 2894 page_results = self._cache.get(pagenum)
2895 if page_results is None:
f1d13090 2896 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2897 if self._use_cache:
2898 self._cache[pagenum] = page_results
2899 return page_results
2900
2901 def getslice(self, start=0, end=None):
2902 return list(self._getslice(start, end))
2903
2904 def _getslice(self, start, end):
55575225 2905 raise NotImplementedError('This method must be implemented by subclasses')
2906
2907 def __getitem__(self, idx):
f1d13090 2908 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2909 if not isinstance(idx, int) or idx < 0:
2910 raise TypeError('indices must be non-negative integers')
2911 entries = self.getslice(idx, idx + 1)
d8cf8d97 2912 if not entries:
c07a39ae 2913 raise self.IndexError()
d8cf8d97 2914 return entries[0]
55575225 2915
9c44d242
PH
2916
2917class OnDemandPagedList(PagedList):
a44ca5a4 2918 """Download pages until a page with fewer than the maximum number of results"""
86e5f3ed 2919
7be9ccff 2920 def _getslice(self, start, end):
b7ab0590
PH
2921 for pagenum in itertools.count(start // self._pagesize):
2922 firstid = pagenum * self._pagesize
2923 nextfirstid = pagenum * self._pagesize + self._pagesize
2924 if start >= nextfirstid:
2925 continue
2926
b7ab0590
PH
2927 startv = (
2928 start % self._pagesize
2929 if firstid <= start < nextfirstid
2930 else 0)
b7ab0590
PH
2931 endv = (
2932 ((end - 1) % self._pagesize) + 1
2933 if (end is not None and firstid <= end <= nextfirstid)
2934 else None)
2935
f1d13090 2936 try:
2937 page_results = self.getpage(pagenum)
2938 except Exception:
2939 self._pagecount = pagenum - 1
2940 raise
b7ab0590
PH
2941 if startv != 0 or endv is not None:
2942 page_results = page_results[startv:endv]
7be9ccff 2943 yield from page_results
b7ab0590
PH
2944
2945 # A little optimization - if the current page is not "full", i.e. does
2946 # not contain page_size videos, then we can assume that this page
2947 # is the last one - there are no more ids on further pages -
2948 # i.e. no need to query again.
2949 if len(page_results) + startv < self._pagesize:
2950 break
2951
2952 # If we got the whole page, but the next page is not interesting,
2953 # break out early as well
2954 if end == nextfirstid:
2955 break
81c2f20b
PH
2956
2957
9c44d242 2958class InAdvancePagedList(PagedList):
a44ca5a4 2959 """PagedList with total number of pages known in advance"""
86e5f3ed 2960
9c44d242 2961 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2962 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2963 self._pagecount = pagecount
9c44d242 2964
7be9ccff 2965 def _getslice(self, start, end):
9c44d242 2966 start_page = start // self._pagesize
d37707bd 2967 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2968 skip_elems = start - start_page * self._pagesize
2969 only_more = None if end is None else end - start
2970 for pagenum in range(start_page, end_page):
7be9ccff 2971 page_results = self.getpage(pagenum)
9c44d242 2972 if skip_elems:
7be9ccff 2973 page_results = page_results[skip_elems:]
9c44d242
PH
2974 skip_elems = None
2975 if only_more is not None:
7be9ccff 2976 if len(page_results) < only_more:
2977 only_more -= len(page_results)
9c44d242 2978 else:
7be9ccff 2979 yield from page_results[:only_more]
9c44d242 2980 break
7be9ccff 2981 yield from page_results
9c44d242
PH
2982
2983
7e88d7d7 2984class PlaylistEntries:
2985 MissingEntry = object()
2986 is_exhausted = False
2987
2988 def __init__(self, ydl, info_dict):
7e9a6125 2989 self.ydl = ydl
2990
2991 # _entries must be assigned now since infodict can change during iteration
2992 entries = info_dict.get('entries')
2993 if entries is None:
2994 raise EntryNotInPlaylist('There are no entries')
2995 elif isinstance(entries, list):
2996 self.is_exhausted = True
2997
2998 requested_entries = info_dict.get('requested_entries')
bc5c2f8a 2999 self.is_incomplete = requested_entries is not None
7e9a6125 3000 if self.is_incomplete:
3001 assert self.is_exhausted
bc5c2f8a 3002 self._entries = [self.MissingEntry] * max(requested_entries or [0])
7e9a6125 3003 for i, entry in zip(requested_entries, entries):
3004 self._entries[i - 1] = entry
3005 elif isinstance(entries, (list, PagedList, LazyList)):
3006 self._entries = entries
3007 else:
3008 self._entries = LazyList(entries)
7e88d7d7 3009
3010 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
3011 (?P<start>[+-]?\d+)?
3012 (?P<range>[:-]
3013 (?P<end>[+-]?\d+|inf(?:inite)?)?
3014 (?::(?P<step>[+-]?\d+))?
3015 )?''')
3016
3017 @classmethod
3018 def parse_playlist_items(cls, string):
3019 for segment in string.split(','):
3020 if not segment:
3021 raise ValueError('There are two or more consecutive commas')
3022 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
3023 if not mobj:
3024 raise ValueError(f'{segment!r} is not a valid specification')
3025 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
3026 if int_or_none(step) == 0:
3027 raise ValueError(f'Step in {segment!r} cannot be zero')
3028 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
3029
3030 def get_requested_items(self):
3031 playlist_items = self.ydl.params.get('playlist_items')
3032 playlist_start = self.ydl.params.get('playliststart', 1)
3033 playlist_end = self.ydl.params.get('playlistend')
3034 # For backwards compatibility, interpret -1 as whole list
3035 if playlist_end in (-1, None):
3036 playlist_end = ''
3037 if not playlist_items:
3038 playlist_items = f'{playlist_start}:{playlist_end}'
3039 elif playlist_start != 1 or playlist_end:
3040 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
3041
3042 for index in self.parse_playlist_items(playlist_items):
3043 for i, entry in self[index]:
3044 yield i, entry
1ac4fd80 3045 if not entry:
3046 continue
7e88d7d7 3047 try:
d21056f4 3048 # The item may have just been added to archive. Don't break due to it
3049 if not self.ydl.params.get('lazy_playlist'):
3050 # TODO: Add auto-generated fields
3051 self.ydl._match_entry(entry, incomplete=True, silent=True)
7e88d7d7 3052 except (ExistingVideoReached, RejectedVideoReached):
3053 return
3054
7e9a6125 3055 def get_full_count(self):
3056 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 3057 return len(self)
3058 elif isinstance(self._entries, InAdvancePagedList):
3059 if self._entries._pagesize == 1:
3060 return self._entries._pagecount
3061
7e88d7d7 3062 @functools.cached_property
3063 def _getter(self):
3064 if isinstance(self._entries, list):
3065 def get_entry(i):
3066 try:
3067 entry = self._entries[i]
3068 except IndexError:
3069 entry = self.MissingEntry
3070 if not self.is_incomplete:
3071 raise self.IndexError()
3072 if entry is self.MissingEntry:
bc5c2f8a 3073 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
7e88d7d7 3074 return entry
3075 else:
3076 def get_entry(i):
3077 try:
3078 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3079 except (LazyList.IndexError, PagedList.IndexError):
3080 raise self.IndexError()
3081 return get_entry
3082
3083 def __getitem__(self, idx):
3084 if isinstance(idx, int):
3085 idx = slice(idx, idx)
3086
3087 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3088 step = 1 if idx.step is None else idx.step
3089 if idx.start is None:
3090 start = 0 if step > 0 else len(self) - 1
3091 else:
3092 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3093
3094 # NB: Do not call len(self) when idx == [:]
3095 if idx.stop is None:
3096 stop = 0 if step < 0 else float('inf')
3097 else:
3098 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3099 stop += [-1, 1][step > 0]
3100
3101 for i in frange(start, stop, step):
3102 if i < 0:
3103 continue
3104 try:
7e9a6125 3105 entry = self._getter(i)
3106 except self.IndexError:
3107 self.is_exhausted = True
3108 if step > 0:
7e88d7d7 3109 break
7e9a6125 3110 continue
7e88d7d7 3111 yield i + 1, entry
3112
3113 def __len__(self):
3114 return len(tuple(self[:]))
3115
3116 class IndexError(IndexError):
3117 pass
3118
3119
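# A minimal usage sketch (illustrative only, not part of the original module):
# parse_playlist_items turns a --playlist-items specification into the
# indices/slices that __getitem__ consumes.
# >>> list(PlaylistEntries.parse_playlist_items('2:10:2,15'))
# [slice(2, 10.0, 2), 15]
# >>> list(PlaylistEntries.parse_playlist_items(':'))
# [slice(None, None, None)]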
81c2f20b 3120def uppercase_escape(s):
676eb3f2 3121 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 3122 return re.sub(
a612753d 3123 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
3124 lambda m: unicode_escape(m.group(0))[0],
3125 s)
0fe2ff78
YCH
3126
3127
3128def lowercase_escape(s):
3129 unicode_escape = codecs.getdecoder('unicode_escape')
3130 return re.sub(
3131 r'\\u[0-9a-fA-F]{4}',
3132 lambda m: unicode_escape(m.group(0))[0],
3133 s)
b53466e1 3134
d05cfe06
S
3135
3136def escape_rfc3986(s):
3137 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 3138 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
3139
3140
3141def escape_url(url):
3142 """Escape URL as suggested by RFC 3986"""
14f25df2 3143 url_parsed = urllib.parse.urlparse(url)
d05cfe06 3144 return url_parsed._replace(
efbed08d 3145 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
3146 path=escape_rfc3986(url_parsed.path),
3147 params=escape_rfc3986(url_parsed.params),
3148 query=escape_rfc3986(url_parsed.query),
3149 fragment=escape_rfc3986(url_parsed.fragment)
3150 ).geturl()
3151
62e609ab 3152
96b9e9cf 3153def parse_qs(url, **kwargs):
3154 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
4dfbf869 3155
3156
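# Illustrative sketch of the two URL helpers above (values assume standard
# urllib percent-encoding behaviour; not part of the original module):
# >>> escape_url('http://example.com/путь')
# 'http://example.com/%D0%BF%D1%83%D1%82%D1%8C'
# >>> parse_qs('http://example.com/?a=1&a=2&b=3')
# {'a': ['1', '2'], 'b': ['3']}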
62e609ab
PH
3157def read_batch_urls(batch_fd):
3158 def fixup(url):
14f25df2 3159 if not isinstance(url, str):
62e609ab 3160 url = url.decode('utf-8', 'replace')
8c04f0be 3161 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3162 for bom in BOM_UTF8:
3163 if url.startswith(bom):
3164 url = url[len(bom):]
3165 url = url.lstrip()
3166 if not url or url.startswith(('#', ';', ']')):
62e609ab 3167 return False
8c04f0be 3168 # "#" cannot be stripped out since it is part of the URI
962ffcf8 3169 # However, it can be safely stripped out if it follows whitespace
8c04f0be 3170 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
3171
3172 with contextlib.closing(batch_fd) as fd:
3173 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
3174
3175
3176def urlencode_postdata(*args, **kargs):
14f25df2 3177 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
3178
3179
45b2ee6f 3180def update_url(url, *, query_update=None, **kwargs):
3181 """Replace URL components specified by kwargs
3182 @param url str or parse url tuple
3183 @param query_update update query
3184 @returns str
3185 """
3186 if isinstance(url, str):
3187 if not kwargs and not query_update:
3188 return url
3189 else:
3190 url = urllib.parse.urlparse(url)
3191 if query_update:
3192 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
3193 kwargs['query'] = urllib.parse.urlencode({
3194 **urllib.parse.parse_qs(url.query),
3195 **query_update
3196 }, True)
3197 return urllib.parse.urlunparse(url._replace(**kwargs))
3198
3199
38f9ef31 3200def update_url_query(url, query):
45b2ee6f 3201 return update_url(url, query_update=query)
16392824 3202
8e60dc75 3203
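# Illustrative sketch (not part of the original module): update_url_query
# merges the given parameters into the URL's existing query string.
# >>> update_url_query('http://example.com/path?a=1', {'b': '2'})
# 'http://example.com/path?a=1&b=2'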
c043c246 3204def update_Request(req, url=None, data=None, headers=None, query=None):
ed0291d1 3205 req_headers = req.headers.copy()
c043c246 3206 req_headers.update(headers or {})
ed0291d1
S
3207 req_data = data or req.data
3208 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
3209 req_get_method = req.get_method()
3210 if req_get_method == 'HEAD':
3211 req_type = HEADRequest
3212 elif req_get_method == 'PUT':
3213 req_type = PUTRequest
3214 else:
ac668111 3215 req_type = urllib.request.Request
ed0291d1
S
3216 new_req = req_type(
3217 req_url, data=req_data, headers=req_headers,
3218 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3219 if hasattr(req, 'timeout'):
3220 new_req.timeout = req.timeout
3221 return new_req
3222
3223
10c87c15 3224def _multipart_encode_impl(data, boundary):
0c265486
YCH
3225 content_type = 'multipart/form-data; boundary=%s' % boundary
3226
3227 out = b''
3228 for k, v in data.items():
3229 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3230 if isinstance(k, str):
0f06bcd7 3231 k = k.encode()
14f25df2 3232 if isinstance(v, str):
0f06bcd7 3233 v = v.encode()
0c265486
YCH
3234 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3235 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3236 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3237 if boundary.encode('ascii') in content:
3238 raise ValueError('Boundary overlaps with data')
3239 out += content
3240
3241 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3242
3243 return out, content_type
3244
3245
3246def multipart_encode(data, boundary=None):
3247 '''
3248 Encode a dict to RFC 7578-compliant form-data
3249
3250 data:
3251 A dict where keys and values can be either Unicode or bytes-like
3252 objects.
3253 boundary:
3254 If specified, it must be a Unicode object and is used as the boundary. Otherwise
3255 a random boundary is generated.
3256
3257 Reference: https://tools.ietf.org/html/rfc7578
3258 '''
3259 has_specified_boundary = boundary is not None
3260
3261 while True:
3262 if boundary is None:
3263 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3264
3265 try:
10c87c15 3266 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
3267 break
3268 except ValueError:
3269 if has_specified_boundary:
3270 raise
3271 boundary = None
3272
3273 return out, content_type
3274
3275
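# Illustrative sketch with a fixed boundary (a random boundary is generated
# when the argument is omitted); not part of the original module:
# >>> body, ctype = multipart_encode({'field': 'value'}, boundary='BOUNDARY')
# >>> ctype
# 'multipart/form-data; boundary=BOUNDARY'
# >>> body
# b'--BOUNDARY\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--BOUNDARY--\r\n'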
304ad45a 3276def variadic(x, allowed_types=(str, bytes, dict)):
3277 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
3278
3279
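# Illustrative sketch: variadic wraps scalars into a 1-tuple but passes other
# iterables through, except for the types listed in allowed_types.
# >>> variadic('spam')
# ('spam',)
# >>> variadic(['spam', 'eggs'])
# ['spam', 'eggs']
# >>> variadic({'k': 'v'})  # dict is in allowed_types, so it is wrapped too
# ({'k': 'v'},)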
86296ad2 3280def dict_get(d, key_or_keys, default=None, skip_false_values=True):
a44ca5a4 3281 for val in map(d.get, variadic(key_or_keys)):
3282 if val is not None and (val or not skip_false_values):
3283 return val
3284 return default
cbecc9b9
S
3285
3286
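# Illustrative sketch: the first key with a truthy value wins unless
# skip_false_values is disabled.
# >>> dict_get({'a': '', 'b': 'value'}, ('a', 'b'))
# 'value'
# >>> dict_get({'a': ''}, 'a', skip_false_values=False)
# ''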
c4f60dd7 3287def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3288 for f in funcs:
a32a9a7e 3289 try:
c4f60dd7 3290 val = f(*args, **kwargs)
ab029d7e 3291 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
a32a9a7e
S
3292 pass
3293 else:
c4f60dd7 3294 if expected_type is None or isinstance(val, expected_type):
3295 return val
3296
3297
3298def try_get(src, getter, expected_type=None):
3299 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
3300
3301
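# Illustrative sketch: try_get swallows the usual lookup errors and optionally
# type-checks the result.
# >>> try_get({'a': {'b': 3}}, lambda x: x['a']['b'], expected_type=int)
# 3
# >>> try_get({}, lambda x: x['a']['b']) is None
# True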
90137ca4 3302def filter_dict(dct, cndn=lambda _, v: v is not None):
3303 return {k: v for k, v in dct.items() if cndn(k, v)}
3304
3305
6cc62232
S
3306def merge_dicts(*dicts):
3307 merged = {}
3308 for a_dict in dicts:
3309 for k, v in a_dict.items():
90137ca4 3310 if (v is not None and k not in merged
3311 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
3312 merged[k] = v
3313 return merged
3314
3315
8e60dc75 3316def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 3317 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 3318
16392824 3319
a1a530b0
PH
3320US_RATINGS = {
3321 'G': 0,
3322 'PG': 10,
3323 'PG-13': 13,
3324 'R': 16,
3325 'NC': 18,
3326}
fac55558
PH
3327
3328
a8795327 3329TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3330 'TV-Y': 0,
3331 'TV-Y7': 7,
3332 'TV-G': 0,
3333 'TV-PG': 0,
3334 'TV-14': 14,
3335 'TV-MA': 17,
a8795327
S
3336}
3337
3338
146c80e2 3339def parse_age_limit(s):
19a03940 3340 # isinstance(False, int) is True. So type() must be used instead
c487cf00 3341 if type(s) is int: # noqa: E721
a8795327 3342 return s if 0 <= s <= 21 else None
19a03940 3343 elif not isinstance(s, str):
d838b1bd 3344 return None
146c80e2 3345 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3346 if m:
3347 return int(m.group('age'))
5c5fae6d 3348 s = s.upper()
a8795327
S
3349 if s in US_RATINGS:
3350 return US_RATINGS[s]
5a16c9d9 3351 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3352 if m:
5a16c9d9 3353 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3354 return None
146c80e2
S
3355
3356
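# Illustrative sketch of the rating formats accepted by parse_age_limit:
# >>> parse_age_limit('PG-13')
# 13
# >>> parse_age_limit('TV-MA')
# 17
# >>> parse_age_limit('18+')
# 18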
fac55558 3357def strip_jsonp(code):
609a61e3 3358 return re.sub(
5552c9eb 3359 r'''(?sx)^
e9c671d5 3360 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3361 (?:\s*&&\s*(?P=func_name))?
3362 \s*\(\s*(?P<callback_data>.*)\);?
3363 \s*?(?://[^\n]*)*$''',
3364 r'\g<callback_data>', code)
478c2c61
PH
3365
3366
8f53dc44 3367def js_to_json(code, vars={}, *, strict=False):
5c610515 3368 # vars is a dict of var, val pairs to substitute
0898c5c8 3369 STRING_QUOTES = '\'"`'
a71b812f 3370 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
c843e685 3371 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3372 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3373 INTEGER_TABLE = (
86e5f3ed 3374 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3375 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
3376 )
3377
a71b812f
SS
3378 def process_escape(match):
3379 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3380 escape = match.group(1) or match.group(2)
3381
3382 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3383 else R'\u00' if escape == 'x'
3384 else '' if escape == '\n'
3385 else escape)
3386
0898c5c8
SS
3387 def template_substitute(match):
3388 evaluated = js_to_json(match.group(1), vars, strict=strict)
3389 if evaluated[0] == '"':
3390 return json.loads(evaluated)
3391 return evaluated
3392
e05f6939 3393 def fix_kv(m):
e7b6d122
PH
3394 v = m.group(0)
3395 if v in ('true', 'false', 'null'):
3396 return v
421ddcb8
C
3397 elif v in ('undefined', 'void 0'):
3398 return 'null'
8bdd16b4 3399 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
a71b812f
SS
3400 return ''
3401
3402 if v[0] in STRING_QUOTES:
0898c5c8
SS
3403 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
3404 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
a71b812f
SS
3405 return f'"{escaped}"'
3406
3407 for regex, base in INTEGER_TABLE:
3408 im = re.match(regex, v)
3409 if im:
3410 i = int(im.group(1), base)
3411 return f'"{i}":' if v.endswith(':') else str(i)
3412
3413 if v in vars:
d5f043d1
C
3414 try:
3415 if not strict:
3416 json.loads(vars[v])
08e29b9f 3417 except json.JSONDecodeError:
d5f043d1
C
3418 return json.dumps(vars[v])
3419 else:
3420 return vars[v]
89ac4a19 3421
a71b812f
SS
3422 if not strict:
3423 return f'"{v}"'
5c610515 3424
a71b812f 3425 raise ValueError(f'Unknown value: {v}')
e05f6939 3426
8072ef2b 3427 def create_map(mobj):
3428 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3429
8072ef2b 3430 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 3431 if not strict:
3432 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
f55523cf 3433 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
389896df 3434 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
3435 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
febff4c1 3436
a71b812f
SS
3437 return re.sub(rf'''(?sx)
3438 {STRING_RE}|
3439 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
421ddcb8 3440 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
a71b812f
SS
3441 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3442 [0-9]+(?={SKIP_RE}:)|
8bdd16b4 3443 !+
a71b812f 3444 ''', fix_kv, code)
e05f6939
PH
3445
3446
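# Illustrative sketch of js_to_json (a simple case; the function also handles
# comments, template strings, octal literals and trailing commas):
# >>> js_to_json("{abc: true, 'def': 0x10}")
# '{"abc": true, "def": 16}'
# >>> json.loads(js_to_json("{abc: true, 'def': 0x10}"))
# {'abc': True, 'def': 16}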
478c2c61
PH
3447def qualities(quality_ids):
3448 """ Get a numeric quality value out of a list of possible values """
3449 def q(qid):
3450 try:
3451 return quality_ids.index(qid)
3452 except ValueError:
3453 return -1
3454 return q
3455
acd69589 3456
119e40ef 3457POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3458
3459
de6000d9 3460DEFAULT_OUTTMPL = {
3461 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3462 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3463}
3464OUTTMPL_TYPES = {
72755351 3465 'chapter': None,
de6000d9 3466 'subtitle': None,
3467 'thumbnail': None,
3468 'description': 'description',
3469 'annotation': 'annotations.xml',
3470 'infojson': 'info.json',
08438d2c 3471 'link': None,
3b603dbd 3472 'pl_video': None,
5112f26a 3473 'pl_thumbnail': None,
de6000d9 3474 'pl_description': 'description',
3475 'pl_infojson': 'info.json',
3476}
0a871f68 3477
143db31d 3478# As of [1], the format syntax is:
3479# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3480# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3481STR_FORMAT_RE_TMPL = r'''(?x)
3482 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3483 %
524e2e4f 3484 (?P<has_key>\((?P<key>{0})\))?
752cda38 3485 (?P<format>
524e2e4f 3486 (?P<conversion>[#0\-+ ]+)?
3487 (?P<min_width>\d+)?
3488 (?P<precision>\.\d+)?
3489 (?P<len_mod>[hlL])? # unused in python
901130bb 3490 {1} # conversion type
752cda38 3491 )
143db31d 3492'''
3493
7d1eb38a 3494
901130bb 3495STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3496
7d1eb38a 3497
a020a0dc
PH
3498def limit_length(s, length):
3499 """ Add ellipses to overly long strings """
3500 if s is None:
3501 return None
3502 ELLIPSES = '...'
3503 if len(s) > length:
3504 return s[:length - len(ELLIPSES)] + ELLIPSES
3505 return s
48844745
PH
3506
3507
3508def version_tuple(v):
5f9b8394 3509 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3510
3511
3512def is_outdated_version(version, limit, assume_new=True):
3513 if not version:
3514 return not assume_new
3515 try:
3516 return version_tuple(version) < version_tuple(limit)
3517 except ValueError:
3518 return not assume_new
732ea2f0
PH
3519
3520
3521def ytdl_is_updateable():
7a5c1cfe 3522 """ Returns if yt-dlp can be updated with -U """
735d865e 3523
5d535b4a 3524 from .update import is_non_updateable
732ea2f0 3525
5d535b4a 3526 return not is_non_updateable()
7d4111ed
PH
3527
3528
3529def args_to_str(args):
3530 # Get a short string representation for a subprocess command
702ccf2d 3531 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3532
3533
9b9c5355 3534def error_to_compat_str(err):
cfb0511d 3535 return str(err)
fdae2358
S
3536
3537
a44ca5a4 3538def error_to_str(err):
3539 return f'{type(err).__name__}: {err}'
3540
3541
2647c933 3542def mimetype2ext(mt, default=NO_DEFAULT):
3543 if not isinstance(mt, str):
3544 if default is not NO_DEFAULT:
3545 return default
eb9ee194
S
3546 return None
3547
2647c933 3548 MAP = {
3549 # video
f6861ec9 3550 '3gpp': '3gp',
2647c933 3551 'mp2t': 'ts',
3552 'mp4': 'mp4',
3553 'mpeg': 'mpeg',
3554 'mpegurl': 'm3u8',
3555 'quicktime': 'mov',
3556 'webm': 'webm',
3557 'vp9': 'vp9',
f6861ec9 3558 'x-flv': 'flv',
2647c933 3559 'x-m4v': 'm4v',
3560 'x-matroska': 'mkv',
3561 'x-mng': 'mng',
a0d8d704 3562 'x-mp4-fragmented': 'mp4',
2647c933 3563 'x-ms-asf': 'asf',
a0d8d704 3564 'x-ms-wmv': 'wmv',
2647c933 3565 'x-msvideo': 'avi',
3566
3567 # application (streaming playlists)
b4173f15 3568 'dash+xml': 'mpd',
b4173f15 3569 'f4m+xml': 'f4m',
f164b971 3570 'hds+xml': 'f4m',
2647c933 3571 'vnd.apple.mpegurl': 'm3u8',
e910fe2f 3572 'vnd.ms-sstr+xml': 'ism',
2647c933 3573 'x-mpegurl': 'm3u8',
3574
3575 # audio
3576 'audio/mp4': 'm4a',
3577 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3578 # Using .mp3 as it's the most popular one
3579 'audio/mpeg': 'mp3',
d80ca5de 3580 'audio/webm': 'webm',
2647c933 3581 'audio/x-matroska': 'mka',
3582 'audio/x-mpegurl': 'm3u',
3583 'midi': 'mid',
3584 'ogg': 'ogg',
3585 'wav': 'wav',
3586 'wave': 'wav',
3587 'x-aac': 'aac',
3588 'x-flac': 'flac',
3589 'x-m4a': 'm4a',
3590 'x-realaudio': 'ra',
39e7107d 3591 'x-wav': 'wav',
9359f3d4 3592
2647c933 3593 # image
3594 'avif': 'avif',
3595 'bmp': 'bmp',
3596 'gif': 'gif',
3597 'jpeg': 'jpg',
3598 'png': 'png',
3599 'svg+xml': 'svg',
3600 'tiff': 'tif',
3601 'vnd.wap.wbmp': 'wbmp',
3602 'webp': 'webp',
3603 'x-icon': 'ico',
3604 'x-jng': 'jng',
3605 'x-ms-bmp': 'bmp',
3606
3607 # caption
3608 'filmstrip+json': 'fs',
3609 'smptett+xml': 'tt',
3610 'ttaf+xml': 'dfxp',
3611 'ttml+xml': 'ttml',
3612 'x-ms-sami': 'sami',
9359f3d4 3613
2647c933 3614 # misc
3615 'gzip': 'gz',
9359f3d4
F
3616 'json': 'json',
3617 'xml': 'xml',
3618 'zip': 'zip',
9359f3d4
F
3619 }
3620
2647c933 3621 mimetype = mt.partition(';')[0].strip().lower()
3622 _, _, subtype = mimetype.rpartition('/')
9359f3d4 3623
2647c933 3624 ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
3625 if ext:
3626 return ext
3627 elif default is not NO_DEFAULT:
3628 return default
9359f3d4 3629 return subtype.replace('+', '.')
c460bdd5
PH
3630
3631
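# Illustrative sketch: parameters after ';' are ignored, and unknown subtypes
# fall back to the subtype itself (with '+' replaced by '.').
# >>> mimetype2ext('video/mp4; codecs="avc1.42E01E, mp4a.40.2"')
# 'mp4'
# >>> mimetype2ext('application/x-mpegURL')
# 'm3u8'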
2814f12b
THD
3632def ext2mimetype(ext_or_url):
3633 if not ext_or_url:
3634 return None
3635 if '.' not in ext_or_url:
3636 ext_or_url = f'file.{ext_or_url}'
3637 return mimetypes.guess_type(ext_or_url)[0]
3638
3639
4f3c5e06 3640def parse_codecs(codecs_str):
3641 # http://tools.ietf.org/html/rfc6381
3642 if not codecs_str:
3643 return {}
a0566bbf 3644 split_codecs = list(filter(None, map(
dbf5416a 3645 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3646 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3647 for full_codec in split_codecs:
d816f61f 3648 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3649 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3650 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3651 if vcodec:
3652 continue
3653 vcodec = full_codec
3654 if parts[0] in ('dvh1', 'dvhe'):
3655 hdr = 'DV'
3656 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3657 hdr = 'HDR10'
3658 elif parts[:2] == ['vp9', '2']:
3659 hdr = 'HDR10'
71082216 3660 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
d816f61f 3661 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3662 acodec = acodec or full_codec
3663 elif parts[0] in ('stpp', 'wvtt'):
3664 scodec = scodec or full_codec
4f3c5e06 3665 else:
19a03940 3666 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3667 if vcodec or acodec or scodec:
4f3c5e06 3668 return {
3669 'vcodec': vcodec or 'none',
3670 'acodec': acodec or 'none',
176f1866 3671 'dynamic_range': hdr,
3fe75fdc 3672 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3673 }
b69fd25c 3674 elif len(split_codecs) == 2:
3675 return {
3676 'vcodec': split_codecs[0],
3677 'acodec': split_codecs[1],
3678 }
4f3c5e06 3679 return {}
3680
3681
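# Illustrative sketch for a typical RFC 6381 codecs string:
# >>> parse_codecs('avc1.64001F, mp4a.40.2')
# {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', 'dynamic_range': None}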
fc61aff4
LL
3682def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3683 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3684
3685 allow_mkv = not preferences or 'mkv' in preferences
3686
3687 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3688 return 'mkv' # TODO: any other format allows this?
3689
3690 # TODO: Not all codecs supported by parse_codecs are handled here
3691 COMPATIBLE_CODECS = {
3692 'mp4': {
71082216 3693 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
81b6102d 3694 'h264', 'aacl', 'ec-3', # Set in ISM
fc61aff4
LL
3695 },
3696 'webm': {
3697 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3698 'vp9x', 'vp8x', # in the webm spec
3699 },
3700 }
3701
a5387729 3702 sanitize_codec = functools.partial(
3703 try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
8f84770a 3704 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3705
3706 for ext in preferences or COMPATIBLE_CODECS.keys():
3707 codec_set = COMPATIBLE_CODECS.get(ext, set())
3708 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3709 return ext
3710
3711 COMPATIBLE_EXTS = (
3712 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
fbb73833 3713 {'webm', 'weba'},
fc61aff4
LL
3714 )
3715 for ext in preferences or vexts:
3716 current_exts = {ext, *vexts, *aexts}
3717 if ext == 'mkv' or current_exts == {ext} or any(
3718 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3719 return ext
3720 return 'mkv' if allow_mkv else preferences[-1]
3721
3722
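# Illustrative sketch: a single H.264 + AAC pair fits into mp4, so mkv is not
# forced.
# >>> get_compatible_ext(vcodecs=['avc1'], acodecs=['mp4a'], vexts=['mp4'], aexts=['m4a'])
# 'mp4'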
2647c933 3723def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
79298173 3724 getheader = url_handle.headers.get
2ccd1b10 3725
b55ee18f
PH
3726 cd = getheader('Content-Disposition')
3727 if cd:
3728 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3729 if m:
3730 e = determine_ext(m.group('filename'), default_ext=None)
3731 if e:
3732 return e
3733
2647c933 3734 meta_ext = getheader('x-amz-meta-name')
3735 if meta_ext:
3736 e = meta_ext.rpartition('.')[2]
3737 if e:
3738 return e
3739
3740 return mimetype2ext(getheader('Content-Type'), default=default)
05900629
PH
3741
3742
1e399778
YCH
3743def encode_data_uri(data, mime_type):
3744 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3745
3746
05900629 3747def age_restricted(content_limit, age_limit):
6ec6cb4e 3748 """ Returns True iff the content should be blocked """
05900629
PH
3749
3750 if age_limit is None: # No limit set
3751 return False
3752 if content_limit is None:
3753 return False # Content available for everyone
3754 return age_limit < content_limit
61ca9a80
PH
3755
3756
88f60feb 3757# List of known byte-order-marks (BOM)
a904a7f8
L
3758BOMS = [
3759 (b'\xef\xbb\xbf', 'utf-8'),
3760 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3761 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3762 (b'\xff\xfe', 'utf-16-le'),
3763 (b'\xfe\xff', 'utf-16-be'),
3764]
a904a7f8
L
3765
3766
61ca9a80
PH
3767def is_html(first_bytes):
3768 """ Detect whether a file contains HTML by examining its first bytes. """
3769
80e8493e 3770 encoding = 'utf-8'
61ca9a80 3771 for bom, enc in BOMS:
80e8493e 3772 while first_bytes.startswith(bom):
3773 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3774
80e8493e 3775 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3776
3777
3778def determine_protocol(info_dict):
3779 protocol = info_dict.get('protocol')
3780 if protocol is not None:
3781 return protocol
3782
7de837a5 3783 url = sanitize_url(info_dict['url'])
a055469f
PH
3784 if url.startswith('rtmp'):
3785 return 'rtmp'
3786 elif url.startswith('mms'):
3787 return 'mms'
3788 elif url.startswith('rtsp'):
3789 return 'rtsp'
3790
3791 ext = determine_ext(url)
3792 if ext == 'm3u8':
deae7c17 3793 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f
PH
3794 elif ext == 'f4m':
3795 return 'f4m'
3796
14f25df2 3797 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3798
3799
c5e3f849 3800def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3801 """ Render a list of rows, each as a list of values.
3802 Text after a \t will be right aligned """
ec11a9f4 3803 def width(string):
c5e3f849 3804 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3805
3806 def get_max_lens(table):
ec11a9f4 3807 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3808
3809 def filter_using_list(row, filterArray):
d16df59d 3810 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3811
d16df59d 3812 max_lens = get_max_lens(data) if hide_empty else []
3813 header_row = filter_using_list(header_row, max_lens)
3814 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3815
cfb56d1a 3816 table = [header_row] + data
76d321f6 3817 max_lens = get_max_lens(table)
c5e3f849 3818 extra_gap += 1
76d321f6 3819 if delim:
c5e3f849 3820 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3821 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3822 for row in table:
3823 for pos, text in enumerate(map(str, row)):
c5e3f849 3824 if '\t' in text:
3825 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3826 else:
3827 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3828 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3829 return ret
347de493
PH
3830
3831
8f18aca8 3832def _match_one(filter_part, dct, incomplete):
77b87f05 3833 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3834 STRING_OPERATORS = {
3835 '*=': operator.contains,
3836 '^=': lambda attr, value: attr.startswith(value),
3837 '$=': lambda attr, value: attr.endswith(value),
3838 '~=': lambda attr, value: re.search(value, attr),
3839 }
347de493 3840 COMPARISON_OPERATORS = {
a047eeb6 3841 **STRING_OPERATORS,
3842 '<=': operator.le, # "<=" must be defined above "<"
347de493 3843 '<': operator.lt,
347de493 3844 '>=': operator.ge,
a047eeb6 3845 '>': operator.gt,
347de493 3846 '=': operator.eq,
347de493 3847 }
a047eeb6 3848
6db9c4d5 3849 if isinstance(incomplete, bool):
3850 is_incomplete = lambda _: incomplete
3851 else:
3852 is_incomplete = lambda k: k in incomplete
3853
64fa820c 3854 operator_rex = re.compile(r'''(?x)
347de493 3855 (?P<key>[a-z_]+)
77b87f05 3856 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3857 (?:
a047eeb6 3858 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3859 (?P<strval>.+?)
347de493 3860 )
347de493 3861 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3862 m = operator_rex.fullmatch(filter_part.strip())
347de493 3863 if m:
18f96d12 3864 m = m.groupdict()
3865 unnegated_op = COMPARISON_OPERATORS[m['op']]
3866 if m['negation']:
77b87f05
MT
3867 op = lambda attr, value: not unnegated_op(attr, value)
3868 else:
3869 op = unnegated_op
18f96d12 3870 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3871 if m['quote']:
3872 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3873 actual_value = dct.get(m['key'])
3874 numeric_comparison = None
f9934b96 3875 if isinstance(actual_value, (int, float)):
e5a088dc
S
3876 # If the original field is a string and the matching comparison value is
3877 # a number, we should respect the origin of the original field
3878 # and process comparison value as a string (see
18f96d12 3879 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3880 try:
18f96d12 3881 numeric_comparison = int(comparison_value)
347de493 3882 except ValueError:
18f96d12 3883 numeric_comparison = parse_filesize(comparison_value)
3884 if numeric_comparison is None:
3885 numeric_comparison = parse_filesize(f'{comparison_value}B')
3886 if numeric_comparison is None:
3887 numeric_comparison = parse_duration(comparison_value)
3888 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3889 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3890 if actual_value is None:
6db9c4d5 3891 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3892 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3893
3894 UNARY_OPERATORS = {
1cc47c66
S
3895 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3896 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3897 }
64fa820c 3898 operator_rex = re.compile(r'''(?x)
347de493 3899 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3900 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3901 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3902 if m:
3903 op = UNARY_OPERATORS[m.group('op')]
3904 actual_value = dct.get(m.group('key'))
6db9c4d5 3905 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3906 return True
347de493
PH
3907 return op(actual_value)
3908
3909 raise ValueError('Invalid filter part %r' % filter_part)
3910
3911
8f18aca8 3912def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3913 """ Filter a dictionary with a simple string syntax.
3914 @returns Whether the filter passes
3915 @param incomplete Set of keys that are expected to be missing from dct.
3916 Can be True/False to indicate all/none of the keys may be missing.
3917 All conditions on incomplete keys pass if the key is missing
8f18aca8 3918 """
347de493 3919 return all(
8f18aca8 3920 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3921 for filter_part in re.split(r'(?<!\\)&', filter_str))
347de493
PH
3922
3923
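# Illustrative sketch of the filter syntax handled by _match_one/match_str:
# >>> match_str('like_count > 100 & duration < 600', {'like_count': 190, 'duration': 300})
# True
# >>> match_str('!is_live', {'is_live': True})
# False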
fe2ce85a 3924def match_filter_func(filters, breaking_filters=None):
3925 if not filters and not breaking_filters:
d1b5f70b 3926 return None
fe2ce85a 3927 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3928 filters = set(variadic(filters or []))
d1b5f70b 3929
492272fe 3930 interactive = '-' in filters
3931 if interactive:
3932 filters.remove('-')
3933
3934 def _match_func(info_dict, incomplete=False):
fe2ce85a 3935 ret = breaking_filters(info_dict, incomplete)
3936 if ret is not None:
3937 raise RejectedVideoReached(ret)
3938
492272fe 3939 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3940 return NO_DEFAULT if interactive and not incomplete else None
347de493 3941 else:
3bec830a 3942 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3943 filter_str = ') | ('.join(map(str.strip, filters))
3944 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3945 return _match_func
91410c9b
PH
3946
3947
f2df4071 3948class download_range_func:
3949 def __init__(self, chapters, ranges):
3950 self.chapters, self.ranges = chapters, ranges
3951
3952 def __call__(self, info_dict, ydl):
0500ee3d 3953 if not self.ranges and not self.chapters:
3954 yield {}
3955
5ec1b6b7 3956 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3957 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3958 for regex in self.chapters or []:
5ec1b6b7 3959 for i, chapter in enumerate(info_dict.get('chapters') or []):
3960 if re.search(regex, chapter['title']):
3961 warning = None
3962 yield {**chapter, 'index': i}
f2df4071 3963 if self.chapters and warning:
5ec1b6b7 3964 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3965
f2df4071 3966 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
5ec1b6b7 3967
f2df4071 3968 def __eq__(self, other):
3969 return (isinstance(other, download_range_func)
3970 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3971
71df9b7f 3972 def __repr__(self):
a5387729 3973 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
71df9b7f 3974
5ec1b6b7 3975
bf6427d2
YCH
3976def parse_dfxp_time_expr(time_expr):
3977 if not time_expr:
d631d5f9 3978 return
bf6427d2 3979
1d485a1a 3980 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3981 if mobj:
3982 return float(mobj.group('time_offset'))
3983
db2fe38b 3984 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3985 if mobj:
db2fe38b 3986 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3987
3988
c1c924ab 3989def srt_subtitles_timecode(seconds):
aa7785f8 3990 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3991
3992
3993def ass_subtitles_timecode(seconds):
3994 time = timetuple_from_msec(seconds * 1000)
3995 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3996
3997
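# Illustrative sketch of the DFXP/SRT/ASS time helpers above:
# >>> parse_dfxp_time_expr('00:01:30.5')
# 90.5
# >>> srt_subtitles_timecode(90.5)
# '00:01:30,500'
# >>> ass_subtitles_timecode(90.5)
# '0:01:30.50'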
3998def dfxp2srt(dfxp_data):
3869028f
YCH
3999 '''
4000 @param dfxp_data A bytes-like object containing DFXP data
4001 @returns A unicode object containing converted SRT data
4002 '''
5b995f71 4003 LEGACY_NAMESPACES = (
3869028f
YCH
4004 (b'http://www.w3.org/ns/ttml', [
4005 b'http://www.w3.org/2004/11/ttaf1',
4006 b'http://www.w3.org/2006/04/ttaf1',
4007 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 4008 ]),
3869028f
YCH
4009 (b'http://www.w3.org/ns/ttml#styling', [
4010 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
4011 ]),
4012 )
4013
4014 SUPPORTED_STYLING = [
4015 'color',
4016 'fontFamily',
4017 'fontSize',
4018 'fontStyle',
4019 'fontWeight',
4020 'textDecoration'
4021 ]
4022
4e335771 4023 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 4024 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 4025 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 4026 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 4027 })
bf6427d2 4028
5b995f71
RA
4029 styles = {}
4030 default_style = {}
4031
86e5f3ed 4032 class TTMLPElementParser:
5b995f71
RA
4033 _out = ''
4034 _unclosed_elements = []
4035 _applied_styles = []
bf6427d2 4036
2b14cb56 4037 def start(self, tag, attrib):
5b995f71
RA
4038 if tag in (_x('ttml:br'), 'br'):
4039 self._out += '\n'
4040 else:
4041 unclosed_elements = []
4042 style = {}
4043 element_style_id = attrib.get('style')
4044 if default_style:
4045 style.update(default_style)
4046 if element_style_id:
4047 style.update(styles.get(element_style_id, {}))
4048 for prop in SUPPORTED_STYLING:
4049 prop_val = attrib.get(_x('tts:' + prop))
4050 if prop_val:
4051 style[prop] = prop_val
4052 if style:
4053 font = ''
4054 for k, v in sorted(style.items()):
4055 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4056 continue
4057 if k == 'color':
4058 font += ' color="%s"' % v
4059 elif k == 'fontSize':
4060 font += ' size="%s"' % v
4061 elif k == 'fontFamily':
4062 font += ' face="%s"' % v
4063 elif k == 'fontWeight' and v == 'bold':
4064 self._out += '<b>'
4065 unclosed_elements.append('b')
4066 elif k == 'fontStyle' and v == 'italic':
4067 self._out += '<i>'
4068 unclosed_elements.append('i')
4069 elif k == 'textDecoration' and v == 'underline':
4070 self._out += '<u>'
4071 unclosed_elements.append('u')
4072 if font:
4073 self._out += '<font' + font + '>'
4074 unclosed_elements.append('font')
4075 applied_style = {}
4076 if self._applied_styles:
4077 applied_style.update(self._applied_styles[-1])
4078 applied_style.update(style)
4079 self._applied_styles.append(applied_style)
4080 self._unclosed_elements.append(unclosed_elements)
bf6427d2 4081
2b14cb56 4082 def end(self, tag):
5b995f71
RA
4083 if tag not in (_x('ttml:br'), 'br'):
4084 unclosed_elements = self._unclosed_elements.pop()
4085 for element in reversed(unclosed_elements):
4086 self._out += '</%s>' % element
4087 if unclosed_elements and self._applied_styles:
4088 self._applied_styles.pop()
bf6427d2 4089
2b14cb56 4090 def data(self, data):
5b995f71 4091 self._out += data
2b14cb56 4092
4093 def close(self):
5b995f71 4094 return self._out.strip()
2b14cb56 4095
6a765f13 4096 # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
4097 # This will not trigger false positives since only UTF-8 text is being replaced
4098 dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
4099
2b14cb56 4100 def parse_node(node):
4101 target = TTMLPElementParser()
4102 parser = xml.etree.ElementTree.XMLParser(target=target)
4103 parser.feed(xml.etree.ElementTree.tostring(node))
4104 return parser.close()
bf6427d2 4105
5b995f71
RA
4106 for k, v in LEGACY_NAMESPACES:
4107 for ns in v:
4108 dfxp_data = dfxp_data.replace(ns, k)
4109
3869028f 4110 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 4111 out = []
5b995f71 4112 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
4113
4114 if not paras:
4115 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 4116
5b995f71
RA
4117 repeat = False
4118 while True:
4119 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
4120 style_id = style.get('id') or style.get(_x('xml:id'))
4121 if not style_id:
4122 continue
5b995f71
RA
4123 parent_style_id = style.get('style')
4124 if parent_style_id:
4125 if parent_style_id not in styles:
4126 repeat = True
4127 continue
4128 styles[style_id] = styles[parent_style_id].copy()
4129 for prop in SUPPORTED_STYLING:
4130 prop_val = style.get(_x('tts:' + prop))
4131 if prop_val:
4132 styles.setdefault(style_id, {})[prop] = prop_val
4133 if repeat:
4134 repeat = False
4135 else:
4136 break
4137
4138 for p in ('body', 'div'):
4139 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4140 if ele is None:
4141 continue
4142 style = styles.get(ele.get('style'))
4143 if not style:
4144 continue
4145 default_style.update(style)
4146
bf6427d2 4147 for para, index in zip(paras, itertools.count(1)):
d631d5f9 4148 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 4149 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
4150 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4151 if begin_time is None:
4152 continue
7dff0363 4153 if not end_time:
d631d5f9
YCH
4154 if not dur:
4155 continue
4156 end_time = begin_time + dur
bf6427d2
YCH
4157 out.append('%d\n%s --> %s\n%s\n\n' % (
4158 index,
c1c924ab
YCH
4159 srt_subtitles_timecode(begin_time),
4160 srt_subtitles_timecode(end_time),
bf6427d2
YCH
4161 parse_node(para)))
4162
4163 return ''.join(out)
4164
4165
c487cf00 4166def cli_option(params, command_option, param, separator=None):
66e289ba 4167 param = params.get(param)
c487cf00 4168 return ([] if param is None
4169 else [command_option, str(param)] if separator is None
4170 else [f'{command_option}{separator}{param}'])
66e289ba
S
4171
4172
4173def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4174 param = params.get(param)
c487cf00 4175 assert param in (True, False, None)
4176 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
4177
4178
4179def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 4180 return [command_option] if params.get(param) == expected_value else []
66e289ba
S
4181
4182
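# Illustrative sketch of how the cli_* helpers translate params into
# command-line arguments for external programs:
# >>> cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
# ['--proxy', '127.0.0.1:3128']
# >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
# ['--no-check-certificate', 'true']
# >>> cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
# ['--quiet']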
e92caff5 4183def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 4184 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 4185 if use_compat:
5b1ecbb3 4186 return argdict
4187 else:
4188 argdict = None
eab9b2bc 4189 if argdict is None:
5b1ecbb3 4190 return default
eab9b2bc 4191 assert isinstance(argdict, dict)
4192
e92caff5 4193 assert isinstance(keys, (list, tuple))
4194 for key_list in keys:
e92caff5 4195 arg_list = list(filter(
4196 lambda x: x is not None,
6606817a 4197 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 4198 if arg_list:
4199 return [arg for args in arg_list for arg in args]
4200 return default
66e289ba 4201
6251555f 4202
330690a2 4203def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4204 main_key, exe = main_key.lower(), exe.lower()
4205 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4206 keys = [f'{root_key}{k}' for k in (keys or [''])]
4207 if root_key in keys:
4208 if main_key != exe:
4209 keys.append((main_key, exe))
4210 keys.append('default')
4211 else:
4212 use_compat = False
4213 return cli_configuration_args(argdict, keys, default, use_compat)
4214
66e289ba 4215
86e5f3ed 4216class ISO639Utils:
39672624
YCH
4217 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4218 _lang_map = {
4219 'aa': 'aar',
4220 'ab': 'abk',
4221 'ae': 'ave',
4222 'af': 'afr',
4223 'ak': 'aka',
4224 'am': 'amh',
4225 'an': 'arg',
4226 'ar': 'ara',
4227 'as': 'asm',
4228 'av': 'ava',
4229 'ay': 'aym',
4230 'az': 'aze',
4231 'ba': 'bak',
4232 'be': 'bel',
4233 'bg': 'bul',
4234 'bh': 'bih',
4235 'bi': 'bis',
4236 'bm': 'bam',
4237 'bn': 'ben',
4238 'bo': 'bod',
4239 'br': 'bre',
4240 'bs': 'bos',
4241 'ca': 'cat',
4242 'ce': 'che',
4243 'ch': 'cha',
4244 'co': 'cos',
4245 'cr': 'cre',
4246 'cs': 'ces',
4247 'cu': 'chu',
4248 'cv': 'chv',
4249 'cy': 'cym',
4250 'da': 'dan',
4251 'de': 'deu',
4252 'dv': 'div',
4253 'dz': 'dzo',
4254 'ee': 'ewe',
4255 'el': 'ell',
4256 'en': 'eng',
4257 'eo': 'epo',
4258 'es': 'spa',
4259 'et': 'est',
4260 'eu': 'eus',
4261 'fa': 'fas',
4262 'ff': 'ful',
4263 'fi': 'fin',
4264 'fj': 'fij',
4265 'fo': 'fao',
4266 'fr': 'fra',
4267 'fy': 'fry',
4268 'ga': 'gle',
4269 'gd': 'gla',
4270 'gl': 'glg',
4271 'gn': 'grn',
4272 'gu': 'guj',
4273 'gv': 'glv',
4274 'ha': 'hau',
4275 'he': 'heb',
b7acc835 4276 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
4277 'hi': 'hin',
4278 'ho': 'hmo',
4279 'hr': 'hrv',
4280 'ht': 'hat',
4281 'hu': 'hun',
4282 'hy': 'hye',
4283 'hz': 'her',
4284 'ia': 'ina',
4285 'id': 'ind',
b7acc835 4286 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
4287 'ie': 'ile',
4288 'ig': 'ibo',
4289 'ii': 'iii',
4290 'ik': 'ipk',
4291 'io': 'ido',
4292 'is': 'isl',
4293 'it': 'ita',
4294 'iu': 'iku',
4295 'ja': 'jpn',
4296 'jv': 'jav',
4297 'ka': 'kat',
4298 'kg': 'kon',
4299 'ki': 'kik',
4300 'kj': 'kua',
4301 'kk': 'kaz',
4302 'kl': 'kal',
4303 'km': 'khm',
4304 'kn': 'kan',
4305 'ko': 'kor',
4306 'kr': 'kau',
4307 'ks': 'kas',
4308 'ku': 'kur',
4309 'kv': 'kom',
4310 'kw': 'cor',
4311 'ky': 'kir',
4312 'la': 'lat',
4313 'lb': 'ltz',
4314 'lg': 'lug',
4315 'li': 'lim',
4316 'ln': 'lin',
4317 'lo': 'lao',
4318 'lt': 'lit',
4319 'lu': 'lub',
4320 'lv': 'lav',
4321 'mg': 'mlg',
4322 'mh': 'mah',
4323 'mi': 'mri',
4324 'mk': 'mkd',
4325 'ml': 'mal',
4326 'mn': 'mon',
4327 'mr': 'mar',
4328 'ms': 'msa',
4329 'mt': 'mlt',
4330 'my': 'mya',
4331 'na': 'nau',
4332 'nb': 'nob',
4333 'nd': 'nde',
4334 'ne': 'nep',
4335 'ng': 'ndo',
4336 'nl': 'nld',
4337 'nn': 'nno',
4338 'no': 'nor',
4339 'nr': 'nbl',
4340 'nv': 'nav',
4341 'ny': 'nya',
4342 'oc': 'oci',
4343 'oj': 'oji',
4344 'om': 'orm',
4345 'or': 'ori',
4346 'os': 'oss',
4347 'pa': 'pan',
4348 'pi': 'pli',
4349 'pl': 'pol',
4350 'ps': 'pus',
4351 'pt': 'por',
4352 'qu': 'que',
4353 'rm': 'roh',
4354 'rn': 'run',
4355 'ro': 'ron',
4356 'ru': 'rus',
4357 'rw': 'kin',
4358 'sa': 'san',
4359 'sc': 'srd',
4360 'sd': 'snd',
4361 'se': 'sme',
4362 'sg': 'sag',
4363 'si': 'sin',
4364 'sk': 'slk',
4365 'sl': 'slv',
4366 'sm': 'smo',
4367 'sn': 'sna',
4368 'so': 'som',
4369 'sq': 'sqi',
4370 'sr': 'srp',
4371 'ss': 'ssw',
4372 'st': 'sot',
4373 'su': 'sun',
4374 'sv': 'swe',
4375 'sw': 'swa',
4376 'ta': 'tam',
4377 'te': 'tel',
4378 'tg': 'tgk',
4379 'th': 'tha',
4380 'ti': 'tir',
4381 'tk': 'tuk',
4382 'tl': 'tgl',
4383 'tn': 'tsn',
4384 'to': 'ton',
4385 'tr': 'tur',
4386 'ts': 'tso',
4387 'tt': 'tat',
4388 'tw': 'twi',
4389 'ty': 'tah',
4390 'ug': 'uig',
4391 'uk': 'ukr',
4392 'ur': 'urd',
4393 'uz': 'uzb',
4394 've': 'ven',
4395 'vi': 'vie',
4396 'vo': 'vol',
4397 'wa': 'wln',
4398 'wo': 'wol',
4399 'xh': 'xho',
4400 'yi': 'yid',
e9a50fba 4401 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4402 'yo': 'yor',
4403 'za': 'zha',
4404 'zh': 'zho',
4405 'zu': 'zul',
4406 }
4407
4408 @classmethod
4409 def short2long(cls, code):
4410 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4411 return cls._lang_map.get(code[:2])
4412
4413 @classmethod
4414 def long2short(cls, code):
4415 """Convert language code from ISO 639-2/T to ISO 639-1"""
4416 for short_name, long_name in cls._lang_map.items():
4417 if long_name == code:
4418 return short_name
4419
4420
86e5f3ed 4421class ISO3166Utils:
4eb10f66
YCH
4422 # From http://data.okfn.org/data/core/country-list
4423 _country_map = {
4424 'AF': 'Afghanistan',
4425 'AX': 'Åland Islands',
4426 'AL': 'Albania',
4427 'DZ': 'Algeria',
4428 'AS': 'American Samoa',
4429 'AD': 'Andorra',
4430 'AO': 'Angola',
4431 'AI': 'Anguilla',
4432 'AQ': 'Antarctica',
4433 'AG': 'Antigua and Barbuda',
4434 'AR': 'Argentina',
4435 'AM': 'Armenia',
4436 'AW': 'Aruba',
4437 'AU': 'Australia',
4438 'AT': 'Austria',
4439 'AZ': 'Azerbaijan',
4440 'BS': 'Bahamas',
4441 'BH': 'Bahrain',
4442 'BD': 'Bangladesh',
4443 'BB': 'Barbados',
4444 'BY': 'Belarus',
4445 'BE': 'Belgium',
4446 'BZ': 'Belize',
4447 'BJ': 'Benin',
4448 'BM': 'Bermuda',
4449 'BT': 'Bhutan',
4450 'BO': 'Bolivia, Plurinational State of',
4451 'BQ': 'Bonaire, Sint Eustatius and Saba',
4452 'BA': 'Bosnia and Herzegovina',
4453 'BW': 'Botswana',
4454 'BV': 'Bouvet Island',
4455 'BR': 'Brazil',
4456 'IO': 'British Indian Ocean Territory',
4457 'BN': 'Brunei Darussalam',
4458 'BG': 'Bulgaria',
4459 'BF': 'Burkina Faso',
4460 'BI': 'Burundi',
4461 'KH': 'Cambodia',
4462 'CM': 'Cameroon',
4463 'CA': 'Canada',
4464 'CV': 'Cape Verde',
4465 'KY': 'Cayman Islands',
4466 'CF': 'Central African Republic',
4467 'TD': 'Chad',
4468 'CL': 'Chile',
4469 'CN': 'China',
4470 'CX': 'Christmas Island',
4471 'CC': 'Cocos (Keeling) Islands',
4472 'CO': 'Colombia',
4473 'KM': 'Comoros',
4474 'CG': 'Congo',
4475 'CD': 'Congo, the Democratic Republic of the',
4476 'CK': 'Cook Islands',
4477 'CR': 'Costa Rica',
4478 'CI': 'Côte d\'Ivoire',
4479 'HR': 'Croatia',
4480 'CU': 'Cuba',
4481 'CW': 'Curaçao',
4482 'CY': 'Cyprus',
4483 'CZ': 'Czech Republic',
4484 'DK': 'Denmark',
4485 'DJ': 'Djibouti',
4486 'DM': 'Dominica',
4487 'DO': 'Dominican Republic',
4488 'EC': 'Ecuador',
4489 'EG': 'Egypt',
4490 'SV': 'El Salvador',
4491 'GQ': 'Equatorial Guinea',
4492 'ER': 'Eritrea',
4493 'EE': 'Estonia',
4494 'ET': 'Ethiopia',
4495 'FK': 'Falkland Islands (Malvinas)',
4496 'FO': 'Faroe Islands',
4497 'FJ': 'Fiji',
4498 'FI': 'Finland',
4499 'FR': 'France',
4500 'GF': 'French Guiana',
4501 'PF': 'French Polynesia',
4502 'TF': 'French Southern Territories',
4503 'GA': 'Gabon',
4504 'GM': 'Gambia',
4505 'GE': 'Georgia',
4506 'DE': 'Germany',
4507 'GH': 'Ghana',
4508 'GI': 'Gibraltar',
4509 'GR': 'Greece',
4510 'GL': 'Greenland',
4511 'GD': 'Grenada',
4512 'GP': 'Guadeloupe',
4513 'GU': 'Guam',
4514 'GT': 'Guatemala',
4515 'GG': 'Guernsey',
4516 'GN': 'Guinea',
4517 'GW': 'Guinea-Bissau',
4518 'GY': 'Guyana',
4519 'HT': 'Haiti',
4520 'HM': 'Heard Island and McDonald Islands',
4521 'VA': 'Holy See (Vatican City State)',
4522 'HN': 'Honduras',
4523 'HK': 'Hong Kong',
4524 'HU': 'Hungary',
4525 'IS': 'Iceland',
4526 'IN': 'India',
4527 'ID': 'Indonesia',
4528 'IR': 'Iran, Islamic Republic of',
4529 'IQ': 'Iraq',
4530 'IE': 'Ireland',
4531 'IM': 'Isle of Man',
4532 'IL': 'Israel',
4533 'IT': 'Italy',
4534 'JM': 'Jamaica',
4535 'JP': 'Japan',
4536 'JE': 'Jersey',
4537 'JO': 'Jordan',
4538 'KZ': 'Kazakhstan',
4539 'KE': 'Kenya',
4540 'KI': 'Kiribati',
4541 'KP': 'Korea, Democratic People\'s Republic of',
4542 'KR': 'Korea, Republic of',
4543 'KW': 'Kuwait',
4544 'KG': 'Kyrgyzstan',
4545 'LA': 'Lao People\'s Democratic Republic',
4546 'LV': 'Latvia',
4547 'LB': 'Lebanon',
4548 'LS': 'Lesotho',
4549 'LR': 'Liberia',
4550 'LY': 'Libya',
4551 'LI': 'Liechtenstein',
4552 'LT': 'Lithuania',
4553 'LU': 'Luxembourg',
4554 'MO': 'Macao',
4555 'MK': 'Macedonia, the Former Yugoslav Republic of',
4556 'MG': 'Madagascar',
4557 'MW': 'Malawi',
4558 'MY': 'Malaysia',
4559 'MV': 'Maldives',
4560 'ML': 'Mali',
4561 'MT': 'Malta',
4562 'MH': 'Marshall Islands',
4563 'MQ': 'Martinique',
4564 'MR': 'Mauritania',
4565 'MU': 'Mauritius',
4566 'YT': 'Mayotte',
4567 'MX': 'Mexico',
4568 'FM': 'Micronesia, Federated States of',
4569 'MD': 'Moldova, Republic of',
4570 'MC': 'Monaco',
4571 'MN': 'Mongolia',
4572 'ME': 'Montenegro',
4573 'MS': 'Montserrat',
4574 'MA': 'Morocco',
4575 'MZ': 'Mozambique',
4576 'MM': 'Myanmar',
4577 'NA': 'Namibia',
4578 'NR': 'Nauru',
4579 'NP': 'Nepal',
4580 'NL': 'Netherlands',
4581 'NC': 'New Caledonia',
4582 'NZ': 'New Zealand',
4583 'NI': 'Nicaragua',
4584 'NE': 'Niger',
4585 'NG': 'Nigeria',
4586 'NU': 'Niue',
4587 'NF': 'Norfolk Island',
4588 'MP': 'Northern Mariana Islands',
4589 'NO': 'Norway',
4590 'OM': 'Oman',
4591 'PK': 'Pakistan',
4592 'PW': 'Palau',
4593 'PS': 'Palestine, State of',
4594 'PA': 'Panama',
4595 'PG': 'Papua New Guinea',
4596 'PY': 'Paraguay',
4597 'PE': 'Peru',
4598 'PH': 'Philippines',
4599 'PN': 'Pitcairn',
4600 'PL': 'Poland',
4601 'PT': 'Portugal',
4602 'PR': 'Puerto Rico',
4603 'QA': 'Qatar',
4604 'RE': 'Réunion',
4605 'RO': 'Romania',
4606 'RU': 'Russian Federation',
4607 'RW': 'Rwanda',
4608 'BL': 'Saint Barthélemy',
4609 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4610 'KN': 'Saint Kitts and Nevis',
4611 'LC': 'Saint Lucia',
4612 'MF': 'Saint Martin (French part)',
4613 'PM': 'Saint Pierre and Miquelon',
4614 'VC': 'Saint Vincent and the Grenadines',
4615 'WS': 'Samoa',
4616 'SM': 'San Marino',
4617 'ST': 'Sao Tome and Principe',
4618 'SA': 'Saudi Arabia',
4619 'SN': 'Senegal',
4620 'RS': 'Serbia',
4621 'SC': 'Seychelles',
4622 'SL': 'Sierra Leone',
4623 'SG': 'Singapore',
4624 'SX': 'Sint Maarten (Dutch part)',
4625 'SK': 'Slovakia',
4626 'SI': 'Slovenia',
4627 'SB': 'Solomon Islands',
4628 'SO': 'Somalia',
4629 'ZA': 'South Africa',
4630 'GS': 'South Georgia and the South Sandwich Islands',
4631 'SS': 'South Sudan',
4632 'ES': 'Spain',
4633 'LK': 'Sri Lanka',
4634 'SD': 'Sudan',
4635 'SR': 'Suriname',
4636 'SJ': 'Svalbard and Jan Mayen',
4637 'SZ': 'Swaziland',
4638 'SE': 'Sweden',
4639 'CH': 'Switzerland',
4640 'SY': 'Syrian Arab Republic',
4641 'TW': 'Taiwan, Province of China',
4642 'TJ': 'Tajikistan',
4643 'TZ': 'Tanzania, United Republic of',
4644 'TH': 'Thailand',
4645 'TL': 'Timor-Leste',
4646 'TG': 'Togo',
4647 'TK': 'Tokelau',
4648 'TO': 'Tonga',
4649 'TT': 'Trinidad and Tobago',
4650 'TN': 'Tunisia',
4651 'TR': 'Turkey',
4652 'TM': 'Turkmenistan',
4653 'TC': 'Turks and Caicos Islands',
4654 'TV': 'Tuvalu',
4655 'UG': 'Uganda',
4656 'UA': 'Ukraine',
4657 'AE': 'United Arab Emirates',
4658 'GB': 'United Kingdom',
4659 'US': 'United States',
4660 'UM': 'United States Minor Outlying Islands',
4661 'UY': 'Uruguay',
4662 'UZ': 'Uzbekistan',
4663 'VU': 'Vanuatu',
4664 'VE': 'Venezuela, Bolivarian Republic of',
4665 'VN': 'Viet Nam',
4666 'VG': 'Virgin Islands, British',
4667 'VI': 'Virgin Islands, U.S.',
4668 'WF': 'Wallis and Futuna',
4669 'EH': 'Western Sahara',
4670 'YE': 'Yemen',
4671 'ZM': 'Zambia',
4672 'ZW': 'Zimbabwe',
2f97cc61 4673 # Not ISO 3166 codes, but used for IP blocks
4674 'AP': 'Asia/Pacific Region',
4675 'EU': 'Europe',
4eb10f66
YCH
4676 }
4677
4678 @classmethod
4679 def short2full(cls, code):
4680 """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
4681 return cls._country_map.get(code.upper())
4682
4683
86e5f3ed 4684class GeoUtils:
773f291d
S
4685 # Major IPv4 address blocks per country
4686 _country_ip_map = {
53896ca5 4687 'AD': '46.172.224.0/19',
773f291d
S
4688 'AE': '94.200.0.0/13',
4689 'AF': '149.54.0.0/17',
4690 'AG': '209.59.64.0/18',
4691 'AI': '204.14.248.0/21',
4692 'AL': '46.99.0.0/16',
4693 'AM': '46.70.0.0/15',
4694 'AO': '105.168.0.0/13',
53896ca5
S
4695 'AP': '182.50.184.0/21',
4696 'AQ': '23.154.160.0/24',
773f291d
S
4697 'AR': '181.0.0.0/12',
4698 'AS': '202.70.112.0/20',
53896ca5 4699 'AT': '77.116.0.0/14',
773f291d
S
4700 'AU': '1.128.0.0/11',
4701 'AW': '181.41.0.0/18',
53896ca5
S
4702 'AX': '185.217.4.0/22',
4703 'AZ': '5.197.0.0/16',
773f291d
S
4704 'BA': '31.176.128.0/17',
4705 'BB': '65.48.128.0/17',
4706 'BD': '114.130.0.0/16',
4707 'BE': '57.0.0.0/8',
53896ca5 4708 'BF': '102.178.0.0/15',
773f291d
S
4709 'BG': '95.42.0.0/15',
4710 'BH': '37.131.0.0/17',
4711 'BI': '154.117.192.0/18',
4712 'BJ': '137.255.0.0/16',
53896ca5 4713 'BL': '185.212.72.0/23',
773f291d
S
4714 'BM': '196.12.64.0/18',
4715 'BN': '156.31.0.0/16',
4716 'BO': '161.56.0.0/16',
4717 'BQ': '161.0.80.0/20',
53896ca5 4718 'BR': '191.128.0.0/12',
773f291d
S
4719 'BS': '24.51.64.0/18',
4720 'BT': '119.2.96.0/19',
4721 'BW': '168.167.0.0/16',
4722 'BY': '178.120.0.0/13',
4723 'BZ': '179.42.192.0/18',
4724 'CA': '99.224.0.0/11',
4725 'CD': '41.243.0.0/16',
53896ca5
S
4726 'CF': '197.242.176.0/21',
4727 'CG': '160.113.0.0/16',
773f291d 4728 'CH': '85.0.0.0/13',
53896ca5 4729 'CI': '102.136.0.0/14',
773f291d
S
4730 'CK': '202.65.32.0/19',
4731 'CL': '152.172.0.0/14',
53896ca5 4732 'CM': '102.244.0.0/14',
773f291d
S
4733 'CN': '36.128.0.0/10',
4734 'CO': '181.240.0.0/12',
4735 'CR': '201.192.0.0/12',
4736 'CU': '152.206.0.0/15',
4737 'CV': '165.90.96.0/19',
4738 'CW': '190.88.128.0/17',
53896ca5 4739 'CY': '31.153.0.0/16',
773f291d
S
4740 'CZ': '88.100.0.0/14',
4741 'DE': '53.0.0.0/8',
4742 'DJ': '197.241.0.0/17',
4743 'DK': '87.48.0.0/12',
4744 'DM': '192.243.48.0/20',
4745 'DO': '152.166.0.0/15',
4746 'DZ': '41.96.0.0/12',
4747 'EC': '186.68.0.0/15',
4748 'EE': '90.190.0.0/15',
4749 'EG': '156.160.0.0/11',
4750 'ER': '196.200.96.0/20',
4751 'ES': '88.0.0.0/11',
4752 'ET': '196.188.0.0/14',
4753 'EU': '2.16.0.0/13',
4754 'FI': '91.152.0.0/13',
4755 'FJ': '144.120.0.0/16',
53896ca5 4756 'FK': '80.73.208.0/21',
773f291d
S
4757 'FM': '119.252.112.0/20',
4758 'FO': '88.85.32.0/19',
4759 'FR': '90.0.0.0/9',
4760 'GA': '41.158.0.0/15',
4761 'GB': '25.0.0.0/8',
4762 'GD': '74.122.88.0/21',
4763 'GE': '31.146.0.0/16',
4764 'GF': '161.22.64.0/18',
4765 'GG': '62.68.160.0/19',
53896ca5
S
4766 'GH': '154.160.0.0/12',
4767 'GI': '95.164.0.0/16',
773f291d
S
4768 'GL': '88.83.0.0/19',
4769 'GM': '160.182.0.0/15',
4770 'GN': '197.149.192.0/18',
4771 'GP': '104.250.0.0/19',
4772 'GQ': '105.235.224.0/20',
4773 'GR': '94.64.0.0/13',
4774 'GT': '168.234.0.0/16',
4775 'GU': '168.123.0.0/16',
4776 'GW': '197.214.80.0/20',
4777 'GY': '181.41.64.0/18',
4778 'HK': '113.252.0.0/14',
4779 'HN': '181.210.0.0/16',
4780 'HR': '93.136.0.0/13',
4781 'HT': '148.102.128.0/17',
4782 'HU': '84.0.0.0/14',
4783 'ID': '39.192.0.0/10',
4784 'IE': '87.32.0.0/12',
4785 'IL': '79.176.0.0/13',
4786 'IM': '5.62.80.0/20',
4787 'IN': '117.192.0.0/10',
4788 'IO': '203.83.48.0/21',
4789 'IQ': '37.236.0.0/14',
4790 'IR': '2.176.0.0/12',
4791 'IS': '82.221.0.0/16',
4792 'IT': '79.0.0.0/10',
4793 'JE': '87.244.64.0/18',
4794 'JM': '72.27.0.0/17',
4795 'JO': '176.29.0.0/16',
53896ca5 4796 'JP': '133.0.0.0/8',
773f291d
S
4797 'KE': '105.48.0.0/12',
4798 'KG': '158.181.128.0/17',
4799 'KH': '36.37.128.0/17',
4800 'KI': '103.25.140.0/22',
4801 'KM': '197.255.224.0/20',
53896ca5 4802 'KN': '198.167.192.0/19',
773f291d
S
4803 'KP': '175.45.176.0/22',
4804 'KR': '175.192.0.0/10',
4805 'KW': '37.36.0.0/14',
4806 'KY': '64.96.0.0/15',
4807 'KZ': '2.72.0.0/13',
4808 'LA': '115.84.64.0/18',
4809 'LB': '178.135.0.0/16',
53896ca5 4810 'LC': '24.92.144.0/20',
773f291d
S
4811 'LI': '82.117.0.0/19',
4812 'LK': '112.134.0.0/15',
53896ca5 4813 'LR': '102.183.0.0/16',
773f291d
S
4814 'LS': '129.232.0.0/17',
4815 'LT': '78.56.0.0/13',
4816 'LU': '188.42.0.0/16',
4817 'LV': '46.109.0.0/16',
4818 'LY': '41.252.0.0/14',
4819 'MA': '105.128.0.0/11',
4820 'MC': '88.209.64.0/18',
4821 'MD': '37.246.0.0/16',
4822 'ME': '178.175.0.0/17',
4823 'MF': '74.112.232.0/21',
4824 'MG': '154.126.0.0/17',
4825 'MH': '117.103.88.0/21',
4826 'MK': '77.28.0.0/15',
4827 'ML': '154.118.128.0/18',
4828 'MM': '37.111.0.0/17',
4829 'MN': '49.0.128.0/17',
4830 'MO': '60.246.0.0/16',
4831 'MP': '202.88.64.0/20',
4832 'MQ': '109.203.224.0/19',
4833 'MR': '41.188.64.0/18',
4834 'MS': '208.90.112.0/22',
4835 'MT': '46.11.0.0/16',
4836 'MU': '105.16.0.0/12',
4837 'MV': '27.114.128.0/18',
53896ca5 4838 'MW': '102.70.0.0/15',
773f291d
S
4839 'MX': '187.192.0.0/11',
4840 'MY': '175.136.0.0/13',
4841 'MZ': '197.218.0.0/15',
4842 'NA': '41.182.0.0/16',
4843 'NC': '101.101.0.0/18',
4844 'NE': '197.214.0.0/18',
4845 'NF': '203.17.240.0/22',
4846 'NG': '105.112.0.0/12',
4847 'NI': '186.76.0.0/15',
4848 'NL': '145.96.0.0/11',
4849 'NO': '84.208.0.0/13',
4850 'NP': '36.252.0.0/15',
4851 'NR': '203.98.224.0/19',
4852 'NU': '49.156.48.0/22',
4853 'NZ': '49.224.0.0/14',
4854 'OM': '5.36.0.0/15',
4855 'PA': '186.72.0.0/15',
4856 'PE': '186.160.0.0/14',
4857 'PF': '123.50.64.0/18',
4858 'PG': '124.240.192.0/19',
4859 'PH': '49.144.0.0/13',
4860 'PK': '39.32.0.0/11',
4861 'PL': '83.0.0.0/11',
4862 'PM': '70.36.0.0/20',
4863 'PR': '66.50.0.0/16',
4864 'PS': '188.161.0.0/16',
4865 'PT': '85.240.0.0/13',
4866 'PW': '202.124.224.0/20',
4867 'PY': '181.120.0.0/14',
4868 'QA': '37.210.0.0/15',
53896ca5 4869 'RE': '102.35.0.0/16',
773f291d 4870 'RO': '79.112.0.0/13',
53896ca5 4871 'RS': '93.86.0.0/15',
773f291d 4872 'RU': '5.136.0.0/13',
53896ca5 4873 'RW': '41.186.0.0/16',
773f291d
S
4874 'SA': '188.48.0.0/13',
4875 'SB': '202.1.160.0/19',
4876 'SC': '154.192.0.0/11',
53896ca5 4877 'SD': '102.120.0.0/13',
773f291d 4878 'SE': '78.64.0.0/12',
53896ca5 4879 'SG': '8.128.0.0/10',
773f291d
S
4880 'SI': '188.196.0.0/14',
4881 'SK': '78.98.0.0/15',
53896ca5 4882 'SL': '102.143.0.0/17',
773f291d
S
4883 'SM': '89.186.32.0/19',
4884 'SN': '41.82.0.0/15',
53896ca5 4885 'SO': '154.115.192.0/18',
773f291d
S
4886 'SR': '186.179.128.0/17',
4887 'SS': '105.235.208.0/21',
4888 'ST': '197.159.160.0/19',
4889 'SV': '168.243.0.0/16',
4890 'SX': '190.102.0.0/20',
4891 'SY': '5.0.0.0/16',
4892 'SZ': '41.84.224.0/19',
4893 'TC': '65.255.48.0/20',
4894 'TD': '154.68.128.0/19',
4895 'TG': '196.168.0.0/14',
4896 'TH': '171.96.0.0/13',
4897 'TJ': '85.9.128.0/18',
4898 'TK': '27.96.24.0/21',
4899 'TL': '180.189.160.0/20',
4900 'TM': '95.85.96.0/19',
4901 'TN': '197.0.0.0/11',
4902 'TO': '175.176.144.0/21',
4903 'TR': '78.160.0.0/11',
4904 'TT': '186.44.0.0/15',
4905 'TV': '202.2.96.0/19',
4906 'TW': '120.96.0.0/11',
4907 'TZ': '156.156.0.0/14',
53896ca5
S
4908 'UA': '37.52.0.0/14',
4909 'UG': '102.80.0.0/13',
4910 'US': '6.0.0.0/8',
773f291d 4911 'UY': '167.56.0.0/13',
53896ca5 4912 'UZ': '84.54.64.0/18',
773f291d 4913 'VA': '212.77.0.0/19',
53896ca5 4914 'VC': '207.191.240.0/21',
773f291d 4915 'VE': '186.88.0.0/13',
53896ca5 4916 'VG': '66.81.192.0/20',
773f291d
S
4917 'VI': '146.226.0.0/16',
4918 'VN': '14.160.0.0/11',
4919 'VU': '202.80.32.0/20',
4920 'WF': '117.20.32.0/21',
4921 'WS': '202.4.32.0/19',
4922 'YE': '134.35.0.0/16',
4923 'YT': '41.242.116.0/22',
4924 'ZA': '41.0.0.0/11',
53896ca5
S
4925 'ZM': '102.144.0.0/13',
4926 'ZW': '102.177.192.0/18',
773f291d
S
4927 }
4928
4929 @classmethod
5f95927a
S
4930 def random_ipv4(cls, code_or_block):
4931 if len(code_or_block) == 2:
4932 block = cls._country_ip_map.get(code_or_block.upper())
4933 if not block:
4934 return None
4935 else:
4936 block = code_or_block
773f291d 4937 addr, preflen = block.split('/')
ac668111 4938 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4939 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4940 return str(socket.inet_ntoa(
ac668111 4941 struct.pack('!L', random.randint(addr_min, addr_max))))
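# Usage sketch (illustrative; assumes the enclosing GeoUtils class, output address is made up):
#   >>> GeoUtils.random_ipv4('US')     # equivalent to GeoUtils.random_ipv4('6.0.0.0/8')
#   '6.23.118.204'                     # some address inside 6.0.0.0/8
#   >>> GeoUtils.random_ipv4('XX')     # unknown country code
#   None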
773f291d
S
4942
4943
ac668111 4944class PerRequestProxyHandler(urllib.request.ProxyHandler):
2461f79d
PH
4945 def __init__(self, proxies=None):
4946 # Set default handlers
4947 for type in ('http', 'https'):
4948 setattr(self, '%s_open' % type,
4949 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4950 meth(r, proxy, type))
ac668111 4951 urllib.request.ProxyHandler.__init__(self, proxies)
2461f79d 4952
91410c9b 4953 def proxy_open(self, req, proxy, type):
2461f79d 4954 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4955 if req_proxy is not None:
4956 proxy = req_proxy
2461f79d
PH
4957 del req.headers['Ytdl-request-proxy']
4958
4959 if proxy == '__noproxy__':
4960 return None # No Proxy
14f25df2 4961 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4962 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4963 # yt-dlp's http/https handlers wrap the socket with SOCKS themselves
71aff188 4964 return None
ac668111 4965 return urllib.request.ProxyHandler.proxy_open(
91410c9b 4966 self, req, proxy, type)
5bc880b9
YCH
4967
4968
0a5445dd
YCH
4969# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4970# released into the public domain
4971# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4972
4973def long_to_bytes(n, blocksize=0):
4974 """long_to_bytes(n:long, blocksize:int) : string
4975 Convert a long integer to a byte string.
4976
4977 If optional blocksize is given and greater than zero, pad the front of the
4978 byte string with binary zeros so that the length is a multiple of
4979 blocksize.
4980 """
4981 # after much testing, this algorithm was deemed to be the fastest
4982 s = b''
4983 n = int(n)
4984 while n > 0:
ac668111 4985 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4986 n = n >> 32
4987 # strip off leading zeros
4988 for i in range(len(s)):
4989 if s[i] != b'\000'[0]:
4990 break
4991 else:
4992 # only happens when n == 0
4993 s = b'\000'
4994 i = 0
4995 s = s[i:]
4996 # add back some pad bytes. this could be done more efficiently w.r.t. the
4997 # de-padding being done above, but sigh...
4998 if blocksize > 0 and len(s) % blocksize:
4999 s = (blocksize - len(s) % blocksize) * b'\000' + s
5000 return s
5001
5002
5003def bytes_to_long(s):
5004 """bytes_to_long(string) : long
5005 Convert a byte string to a long integer.
5006
5007 This is (essentially) the inverse of long_to_bytes().
5008 """
5009 acc = 0
5010 length = len(s)
5011 if length % 4:
5012 extra = (4 - length % 4)
5013 s = b'\000' * extra + s
5014 length = length + extra
5015 for i in range(0, length, 4):
ac668111 5016 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
5017 return acc
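# Worked example (illustrative): the two helpers are inverses of each other.
#   >>> bytes_to_long(b'abcd')              # 0x61626364
#   1633837924
#   >>> long_to_bytes(1633837924)
#   b'abcd'
#   >>> long_to_bytes(1, blocksize=4)       # zero-padded to a multiple of 4 bytes
#   b'\x00\x00\x00\x01'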
5018
5019
5bc880b9
YCH
5020def ohdave_rsa_encrypt(data, exponent, modulus):
5021 '''
5022 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
5023
5024 Input:
5025 data: data to encrypt, bytes-like object
5026 exponent, modulus: parameter e and N of RSA algorithm, both integer
5027 Output: hex string of encrypted data
5028
5029 Limitation: supports one block encryption only
5030 '''
5031
5032 payload = int(binascii.hexlify(data[::-1]), 16)
5033 encrypted = pow(payload, exponent, modulus)
5034 return '%x' % encrypted
81bdc8fd
YCH
5035
5036
f48409c7
YCH
5037def pkcs1pad(data, length):
5038 """
5039 Padding input data with PKCS#1 scheme
5040
5041 @param {int[]} data input data
5042 @param {int} length target length
5043 @returns {int[]} padded data
5044 """
5045 if len(data) > length - 11:
5046 raise ValueError('Input data too long for PKCS#1 padding')
5047
5048 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
5049 return [0, 2] + pseudo_random + [0] + data
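# Layout sketch (illustrative): for length=16 and 3 data bytes, the result is
# [0x00, 0x02] + 10 pseudo-random ints (0-254) + [0x00] + data.
#   >>> padded = pkcs1pad([1, 2, 3], 16)
#   >>> len(padded), padded[:2], padded[-4:]
#   (16, [0, 2], [0, 1, 2, 3])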
5050
5051
7b2c3f47 5052def _base_n_table(n, table):
5053 if not table and not n:
5054 raise ValueError('Either table or n must be specified')
612f2be5 5055 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
5056
44f14eb4 5057 if n and n != len(table):
612f2be5 5058 raise ValueError(f'base {n} exceeds table length {len(table)}')
5059 return table
59f898b7 5060
5eb6bdce 5061
7b2c3f47 5062def encode_base_n(num, n=None, table=None):
5063 """Convert given int to a base-n string"""
612f2be5 5064 table = _base_n_table(n, table)
7b2c3f47 5065 if not num:
5eb6bdce
YCH
5066 return table[0]
5067
7b2c3f47 5068 result, base = '', len(table)
81bdc8fd 5069 while num:
7b2c3f47 5070 result = table[num % base] + result
612f2be5 5071 num = num // base
7b2c3f47 5072 return result
5073
5074
5075def decode_base_n(string, n=None, table=None):
5076 """Convert given base-n string to int"""
5077 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
5078 result, base = 0, len(table)
5079 for char in string:
5080 result = result * base + table[char]
5081 return result
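# Usage sketch (illustrative): base-16 round trip with the default table.
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255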
5082
5083
5084def decode_base(value, digits):
da4db748 5085 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
5086 f'in a future version. Use {__name__}.decode_base_n instead')
7b2c3f47 5087 return decode_base_n(value, table=digits)
f52354a8
YCH
5088
5089
5090def decode_packed_codes(code):
06b3fe29 5091 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 5092 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
5093 base = int(base)
5094 count = int(count)
5095 symbols = symbols.split('|')
5096 symbol_table = {}
5097
5098 while count:
5099 count -= 1
5eb6bdce 5100 base_n_count = encode_base_n(count, base)
f52354a8
YCH
5101 symbol_table[base_n_count] = symbols[count] or base_n_count
5102
5103 return re.sub(
5104 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 5105 obfuscated_code)
e154c651 5106
5107
1ced2221
S
5108def caesar(s, alphabet, shift):
5109 if shift == 0:
5110 return s
5111 l = len(alphabet)
5112 return ''.join(
5113 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5114 for c in s)
5115
5116
5117def rot47(s):
5118 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
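# Usage sketch (illustrative): rot47 is its own inverse over printable ASCII.
#   >>> rot47('Hello')
#   'w6==@'
#   >>> rot47(rot47('Hello'))
#   'Hello'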
5119
5120
e154c651 5121def parse_m3u8_attributes(attrib):
5122 info = {}
5123 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5124 if val.startswith('"'):
5125 val = val[1:-1]
5126 info[key] = val
5127 return info
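# Usage sketch (illustrative attribute list): quoted values keep embedded commas.
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}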
1143535d
YCH
5128
5129
5130def urshift(val, n):
5131 return val >> n if val >= 0 else (val + 0x100000000) >> n
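# Worked example (illustrative): logical (unsigned) right shift of a 32-bit value.
#   >>> urshift(-1, 8)                      # 0xffffffff >> 8
#   16777215
#   >>> urshift(16, 2)
#   4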
d3f8e038
YCH
5132
5133
5134# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 5135# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
5136def decode_png(png_data):
5137 # Reference: https://www.w3.org/TR/PNG/
5138 header = png_data[8:]
5139
5140 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
86e5f3ed 5141 raise OSError('Not a valid PNG file.')
d3f8e038
YCH
5142
5143 int_map = {1: '>B', 2: '>H', 4: '>I'}
ac668111 5144 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
d3f8e038
YCH
5145
5146 chunks = []
5147
5148 while header:
5149 length = unpack_integer(header[:4])
5150 header = header[4:]
5151
5152 chunk_type = header[:4]
5153 header = header[4:]
5154
5155 chunk_data = header[:length]
5156 header = header[length:]
5157
5158 header = header[4:] # Skip CRC
5159
5160 chunks.append({
5161 'type': chunk_type,
5162 'length': length,
5163 'data': chunk_data
5164 })
5165
5166 ihdr = chunks[0]['data']
5167
5168 width = unpack_integer(ihdr[:4])
5169 height = unpack_integer(ihdr[4:8])
5170
5171 idat = b''
5172
5173 for chunk in chunks:
5174 if chunk['type'] == b'IDAT':
5175 idat += chunk['data']
5176
5177 if not idat:
86e5f3ed 5178 raise OSError('Unable to read PNG data.')
d3f8e038
YCH
5179
5180 decompressed_data = bytearray(zlib.decompress(idat))
5181
5182 stride = width * 3
5183 pixels = []
5184
5185 def _get_pixel(idx):
5186 x = idx % stride
5187 y = idx // stride
5188 return pixels[y][x]
5189
5190 for y in range(height):
5191 basePos = y * (1 + stride)
5192 filter_type = decompressed_data[basePos]
5193
5194 current_row = []
5195
5196 pixels.append(current_row)
5197
5198 for x in range(stride):
5199 color = decompressed_data[1 + basePos + x]
5200 basex = y * stride + x
5201 left = 0
5202 up = 0
5203
5204 if x > 2:
5205 left = _get_pixel(basex - 3)
5206 if y > 0:
5207 up = _get_pixel(basex - stride)
5208
5209 if filter_type == 1: # Sub
5210 color = (color + left) & 0xff
5211 elif filter_type == 2: # Up
5212 color = (color + up) & 0xff
5213 elif filter_type == 3: # Average
5214 color = (color + ((left + up) >> 1)) & 0xff
5215 elif filter_type == 4: # Paeth
5216 a = left
5217 b = up
5218 c = 0
5219
5220 if x > 2 and y > 0:
5221 c = _get_pixel(basex - stride - 3)
5222
5223 p = a + b - c
5224
5225 pa = abs(p - a)
5226 pb = abs(p - b)
5227 pc = abs(p - c)
5228
5229 if pa <= pb and pa <= pc:
5230 color = (color + a) & 0xff
5231 elif pb <= pc:
5232 color = (color + b) & 0xff
5233 else:
5234 color = (color + c) & 0xff
5235
5236 current_row.append(color)
5237
5238 return width, height, pixels
efa97bdc
YCH
5239
5240
5241def write_xattr(path, key, value):
6f7563be 5242 # Windows: Write xattrs to NTFS Alternate Data Streams:
5243 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5244 if compat_os_name == 'nt':
5245 assert ':' not in key
5246 assert os.path.exists(path)
efa97bdc
YCH
5247
5248 try:
6f7563be 5249 with open(f'{path}:{key}', 'wb') as f:
5250 f.write(value)
86e5f3ed 5251 except OSError as e:
efa97bdc 5252 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 5253 return
efa97bdc 5254
6f7563be 5255 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 5256
6f7563be 5257 setxattr = None
5258 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5259 # Unicode arguments are not supported in pyxattr until version 0.5.0
5260 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5261 if version_tuple(xattr.__version__) >= (0, 5, 0):
5262 setxattr = xattr.set
5263 elif xattr:
5264 setxattr = xattr.setxattr
efa97bdc 5265
6f7563be 5266 if setxattr:
5267 try:
5268 setxattr(path, key, value)
5269 except OSError as e:
5270 raise XAttrMetadataError(e.errno, e.strerror)
5271 return
efa97bdc 5272
6f7563be 5273 # UNIX Method 2. Use setfattr/xattr executables
5274 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5275 else 'xattr' if check_executable('xattr', ['-h']) else None)
5276 if not exe:
5277 raise XAttrUnavailableError(
5278 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5279 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 5280
0f06bcd7 5281 value = value.decode()
6f7563be 5282 try:
f0c9fb96 5283 _, stderr, returncode = Popen.run(
6f7563be 5284 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 5285 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 5286 except OSError as e:
5287 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 5288 if returncode:
5289 raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
5290
5291
5292def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
5293 start_date = datetime.date(1950, 1, 1)
5294 end_date = datetime.date(1995, 12, 31)
5295 offset = random.randint(0, (end_date - start_date).days)
5296 random_date = start_date + datetime.timedelta(offset)
0c265486 5297 return {
aa374bc7
AS
5298 year_field: str(random_date.year),
5299 month_field: str(random_date.month),
5300 day_field: str(random_date.day),
0c265486 5301 }
732044af 5302
c76eb41b 5303
8c53322c
L
5304def find_available_port(interface=''):
5305 try:
5306 with socket.socket() as sock:
5307 sock.bind((interface, 0))
5308 return sock.getsockname()[1]
5309 except OSError:
5310 return None
5311
5312
732044af 5313# Templates for internet shortcut files, which are plain text files.
e5a998f3 5314DOT_URL_LINK_TEMPLATE = '''\
732044af 5315[InternetShortcut]
5316URL=%(url)s
e5a998f3 5317'''
732044af 5318
e5a998f3 5319DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 5320<?xml version="1.0" encoding="UTF-8"?>
5321<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5322<plist version="1.0">
5323<dict>
5324\t<key>URL</key>
5325\t<string>%(url)s</string>
5326</dict>
5327</plist>
e5a998f3 5328'''
732044af 5329
e5a998f3 5330DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 5331[Desktop Entry]
5332Encoding=UTF-8
5333Name=%(filename)s
5334Type=Link
5335URL=%(url)s
5336Icon=text-html
e5a998f3 5337'''
732044af 5338
08438d2c 5339LINK_TEMPLATES = {
5340 'url': DOT_URL_LINK_TEMPLATE,
5341 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5342 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5343}
5344
732044af 5345
5346def iri_to_uri(iri):
5347 """
5348 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5349
5350 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes (using UTF-8) only those characters that are not already escaped, leaving existing escapes intact.
5351 """
5352
14f25df2 5353 iri_parts = urllib.parse.urlparse(iri)
732044af 5354
5355 if '[' in iri_parts.netloc:
5356 raise ValueError('IPv6 URIs are not yet supported.')
5357 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5358
5359 # The `safe` argument values used by the following code contain the characters that should not be percent-encoded. Everything else except letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5360
5361 net_location = ''
5362 if iri_parts.username:
f9934b96 5363 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 5364 if iri_parts.password is not None:
f9934b96 5365 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 5366 net_location += '@'
5367
0f06bcd7 5368 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 5369 # The 'idna' encoding produces ASCII text.
5370 if iri_parts.port is not None and iri_parts.port != 80:
5371 net_location += ':' + str(iri_parts.port)
5372
f9934b96 5373 return urllib.parse.urlunparse(
732044af 5374 (iri_parts.scheme,
5375 net_location,
5376
f9934b96 5377 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5378
5379 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 5380 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5381
5382 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 5383 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 5384
f9934b96 5385 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 5386
5387 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
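# Usage sketch (illustrative IRI; exact escaping assumed from the rules above):
#   >>> iri_to_uri('https://example.com/föö?q=bär')
#   'https://example.com/f%C3%B6%C3%B6?q=b%C3%A4r'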
5388
5389
5390def to_high_limit_path(path):
5391 if sys.platform in ['win32', 'cygwin']:
5392 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 5393 return '\\\\?\\' + os.path.abspath(path)
732044af 5394
5395 return path
76d321f6 5396
c76eb41b 5397
7b2c3f47 5398def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
e0ddbd02 5399 val = traverse_obj(obj, *variadic(field))
7b2c3f47 5400 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 5401 return default
7b2c3f47 5402 return template % func(val)
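# Usage sketch (illustrative): format a field only when it has a useful value.
#   >>> format_field({'height': 1080}, 'height', '%sp')
#   '1080p'
#   >>> format_field({'height': None}, 'height', '%sp', default='unknown')
#   'unknown'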
00dd0cd5 5403
5404
5405def clean_podcast_url(url):
5406 return re.sub(r'''(?x)
5407 (?:
5408 (?:
5409 chtbl\.com/track|
5410 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5411 play\.podtrac\.com
5412 )/[^/]+|
5413 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5414 flex\.acast\.com|
5415 pd(?:
5416 cn\.co| # https://podcorn.com/analytics-prefix/
5417 st\.fm # https://podsights.com/docs/
5418 )/e
5419 )/''', '', url)
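# Usage sketch (hypothetical URL): the tracking redirect prefix is stripped.
#   >>> clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/traffic.example.com/ep.mp3')
#   'https://traffic.example.com/ep.mp3'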
ffcb8191
THD
5420
5421
5422_HEX_TABLE = '0123456789abcdef'
5423
5424
5425def random_uuidv4():
5426 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5427
5428
5429def make_dir(path, to_screen=None):
5430 try:
5431 dn = os.path.dirname(path)
b25d6cb9
AI
5432 if dn:
5433 os.makedirs(dn, exist_ok=True)
0202b52a 5434 return True
86e5f3ed 5435 except OSError as err:
0202b52a 5436 if callable(to_screen):
5437 to_screen('unable to create directory ' + error_to_compat_str(err))
5438 return False
f74980cb 5439
5440
5441def get_executable_path():
b5899f4f 5442 from .update import _get_variant_and_executable_path
c487cf00 5443
b5899f4f 5444 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5445
5446
8e40b9d1 5447def get_user_config_dirs(package_name):
8e40b9d1
M
5448 # .config (e.g. ~/.config/package_name)
5449 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
773c272d 5450 yield os.path.join(xdg_config_home, package_name)
8e40b9d1
M
5451
5452 # appdata (%APPDATA%/package_name)
5453 appdata_dir = os.getenv('appdata')
5454 if appdata_dir:
773c272d 5455 yield os.path.join(appdata_dir, package_name)
8e40b9d1
M
5456
5457 # home (~/.package_name)
773c272d 5458 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
8e40b9d1
M
5459
5460
5461def get_system_config_dirs(package_name):
8e40b9d1 5462 # /etc/package_name
773c272d 5463 yield os.path.join('/etc', package_name)
06167fbb 5464
5465
325ebc17 5466def traverse_obj(
f99bbfc9 5467 obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
325ebc17 5468 casesense=True, is_user_input=False, traverse_string=False):
ab029d7e
SS
5469 """
5470 Safely traverse nested `dict`s and `Sequence`s
5471
5472 >>> obj = [{}, {"key": "value"}]
5473 >>> traverse_obj(obj, (1, "key"))
5474 "value"
5475
5476 Each of the provided `paths` is tested and the first producing a valid result will be returned.
f99bbfc9 5477 The next path will also be tested if the path branched but no results could be found.
7b0127e1 5478 Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
6839ae1f 5479 Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded.
ab029d7e
SS
5480
5481 The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5482
5483 The keys in the path can be one of:
5484 - `None`: Return the current object.
776995bc
SS
5485 - `set`: Requires the only item in the set to be a type or function,
5486 like `{type}`/`{func}`. If a `type`, returns only values
5487 of this type. If a function, returns `func(obj)`.
8e174ba7 5488 - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
ab029d7e
SS
5489 - `slice`: Branch out and return all values in `obj[key]`.
5490 - `Ellipsis`: Branch out and return a list of all values.
5491 - `tuple`/`list`: Branch out and return a list of all matching values.
5492 Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5493 - `function`: Branch out and return values filtered by the function.
5494 Read as: `[value for key, value in obj if function(key, value)]`.
5495 For `Sequence`s, `key` is the index of the value.
776995bc
SS
5496 For `re.Match`es, `key` is the group number (0 = full match)
5497 as well as additionally any group names, if given.
ab029d7e
SS
5498 - `dict` Transform the current object and return a matching dict.
5499 Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5500
7b0127e1 5501 `tuple`, `list`, and `dict` all support nested paths and branches.
ab029d7e
SS
5502
5503 @params paths Paths which to traverse by.
5504 @param default Value to return if the paths do not match.
b1bde57b
SS
5505 If the last key in the path is a `dict`, it will apply to each value inside
5506 the dict instead, depth first. Try to avoid if using nested `dict` keys.
ab029d7e
SS
5507 @param expected_type If a `type`, only accept final values of this type.
5508 If any other callable, try to call the function on each result.
776995bc
SS
5509 If the last key in the path is a `dict`, it will apply to each value inside
5510 the dict instead, recursively. This does respect branching paths.
ab029d7e
SS
5511 @param get_all If `False`, return the first matching result, otherwise all matching ones.
5512 @param casesense If `False`, consider string dictionary keys as case insensitive.
5513
5514 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5515
5516 @param is_user_input Whether the keys are generated from user input.
5517 If `True` strings get converted to `int`/`slice` if needed.
5518 @param traverse_string Whether to traverse into objects as strings.
5519 If `True`, any non-compatible object will first be
5520 converted into a string and then traversed into.
b1bde57b
SS
5521 The return value of that path will be a string instead,
5522 not respecting any further branching.
ab029d7e
SS
5523
5524
5525 @returns The result of the object traversal.
5526 If successful, `get_all=True`, and the path branches at least once,
5527 then a list of results is returned instead.
b1bde57b
SS
5528 If no `default` is given and the last path branches, a `list` of results
5529 is always returned. If a path ends on a `dict` that result will always be a `dict`.
ab029d7e 5530 """
ab029d7e 5531 casefold = lambda k: k.casefold() if isinstance(k, str) else k
325ebc17 5532
352d63fd 5533 if isinstance(expected_type, type):
5534 type_test = lambda val: val if isinstance(val, expected_type) else None
352d63fd 5535 else:
ab029d7e
SS
5536 type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5537
b1bde57b
SS
5538 def apply_key(key, obj, is_last):
5539 branching = False
5540 result = None
5541
6839ae1f 5542 if obj is None and traverse_string:
b1bde57b 5543 pass
ab029d7e
SS
5544
5545 elif key is None:
b1bde57b 5546 result = obj
ab029d7e 5547
776995bc
SS
5548 elif isinstance(key, set):
5549 assert len(key) == 1, 'Set should only be used to wrap a single item'
5550 item = next(iter(key))
5551 if isinstance(item, type):
5552 if isinstance(obj, item):
b1bde57b 5553 result = obj
776995bc 5554 else:
b1bde57b 5555 result = try_call(item, args=(obj,))
776995bc 5556
ab029d7e 5557 elif isinstance(key, (list, tuple)):
b1bde57b
SS
5558 branching = True
5559 result = itertools.chain.from_iterable(
5560 apply_path(obj, branch, is_last)[0] for branch in key)
ab029d7e
SS
5561
5562 elif key is ...:
b1bde57b 5563 branching = True
ab029d7e 5564 if isinstance(obj, collections.abc.Mapping):
b1bde57b 5565 result = obj.values()
21b5ec86 5566 elif isinstance(obj, collections.abc.Iterable) and not isinstance(obj, (str, bytes)):
b1bde57b 5567 result = obj
7b0127e1 5568 elif isinstance(obj, re.Match):
b1bde57b 5569 result = obj.groups()
ab029d7e 5570 elif traverse_string:
b1bde57b
SS
5571 branching = False
5572 result = str(obj)
5573 else:
5574 result = ()
ab029d7e
SS
5575
5576 elif callable(key):
b1bde57b
SS
5577 branching = True
5578 if isinstance(obj, collections.abc.Mapping):
ab029d7e 5579 iter_obj = obj.items()
21b5ec86 5580 elif isinstance(obj, collections.abc.Iterable) and not isinstance(obj, (str, bytes)):
b1bde57b 5581 iter_obj = enumerate(obj)
7b0127e1 5582 elif isinstance(obj, re.Match):
776995bc
SS
5583 iter_obj = itertools.chain(
5584 enumerate((obj.group(), *obj.groups())),
5585 obj.groupdict().items())
ab029d7e 5586 elif traverse_string:
b1bde57b 5587 branching = False
ab029d7e 5588 iter_obj = enumerate(str(obj))
352d63fd 5589 else:
b1bde57b
SS
5590 iter_obj = ()
5591
5592 result = (v for k, v in iter_obj if try_call(key, args=(k, v)))
5593 if not branching: # string traversal
5594 result = ''.join(result)
ab029d7e
SS
5595
5596 elif isinstance(key, dict):
b1bde57b
SS
5597 iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items())
5598 result = {
5599 k: v if v is not None else default for k, v in iter_obj
5600 if v is not None or default is not NO_DEFAULT
5601 } or None
ab029d7e 5602
7b0127e1 5603 elif isinstance(obj, collections.abc.Mapping):
b1bde57b
SS
5604 result = (obj.get(key) if casesense or (key in obj) else
5605 next((v for k, v in obj.items() if casefold(k) == key), None))
ab029d7e 5606
7b0127e1
SS
5607 elif isinstance(obj, re.Match):
5608 if isinstance(key, int) or casesense:
5609 with contextlib.suppress(IndexError):
b1bde57b 5610 result = obj.group(key)
7b0127e1 5611
b1bde57b
SS
5612 elif isinstance(key, str):
5613 result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
ab029d7e 5614
b1bde57b 5615 elif isinstance(key, (int, slice)):
21b5ec86 5616 if isinstance(obj, collections.abc.Sequence) and not isinstance(obj, (str, bytes)):
b1bde57b
SS
5617 branching = isinstance(key, slice)
5618 with contextlib.suppress(IndexError):
5619 result = obj[key]
6839ae1f
SS
5620 elif traverse_string:
5621 with contextlib.suppress(IndexError):
5622 result = str(obj)[key]
ab029d7e 5623
b1bde57b 5624 return branching, result if branching else (result,)
ab029d7e 5625
776995bc
SS
5626 def lazy_last(iterable):
5627 iterator = iter(iterable)
5628 prev = next(iterator, NO_DEFAULT)
5629 if prev is NO_DEFAULT:
5630 return
5631
5632 for item in iterator:
5633 yield False, prev
5634 prev = item
5635
5636 yield True, prev
5637
b1bde57b 5638 def apply_path(start_obj, path, test_type):
ab029d7e
SS
5639 objs = (start_obj,)
5640 has_branched = False
5641
776995bc
SS
5642 key = None
5643 for last, key in lazy_last(variadic(path, (str, bytes, dict, set))):
b1bde57b
SS
5644 if is_user_input and isinstance(key, str):
5645 if key == ':':
5646 key = ...
5647 elif ':' in key:
5648 key = slice(*map(int_or_none, key.split(':')))
5649 elif int_or_none(key) is not None:
5650 key = int(key)
ab029d7e
SS
5651
5652 if not casesense and isinstance(key, str):
5653 key = key.casefold()
5654
776995bc
SS
5655 if __debug__ and callable(key):
5656 # Verify function signature
5657 inspect.signature(key).bind(None, None)
5658
b1bde57b
SS
5659 new_objs = []
5660 for obj in objs:
5661 branching, results = apply_key(key, obj, last)
5662 has_branched |= branching
5663 new_objs.append(results)
5664
5665 objs = itertools.chain.from_iterable(new_objs)
ab029d7e 5666
776995bc
SS
5667 if test_type and not isinstance(key, (dict, list, tuple)):
5668 objs = map(type_test, objs)
5669
b1bde57b 5670 return objs, has_branched, isinstance(key, dict)
ab029d7e 5671
b1bde57b
SS
5672 def _traverse_obj(obj, path, allow_empty, test_type):
5673 results, has_branched, is_dict = apply_path(obj, path, test_type)
6839ae1f 5674 results = LazyList(item for item in results if item not in (None, {}))
f99bbfc9 5675 if get_all and has_branched:
b1bde57b
SS
5676 if results:
5677 return results.exhaust()
5678 if allow_empty:
5679 return [] if default is NO_DEFAULT else default
5680 return None
f99bbfc9 5681
b1bde57b 5682 return results[0] if results else {} if allow_empty and is_dict else None
f99bbfc9
SS
5683
5684 for index, path in enumerate(paths, 1):
b1bde57b 5685 result = _traverse_obj(obj, path, index == len(paths), True)
ab029d7e
SS
5686 if result is not None:
5687 return result
5688
f99bbfc9 5689 return None if default is NO_DEFAULT else default
324ad820 5690
5691
5692def traverse_dict(dictn, keys, casesense=True):
da4db748 5693 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5694 f'in a future version. Use "{__name__}.traverse_obj" instead')
ee8dd27a 5695 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5696
5697
ff91cf74 5698def get_first(obj, keys, **kwargs):
5699 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5700
5701
3e9b66d7 5702def time_seconds(**kwargs):
83c4970e
L
5703 """
5704 Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
5705 """
5706 return time.time() + datetime.timedelta(**kwargs).total_seconds()
3e9b66d7
LNO
5707
5708
49fa4d9a
N
5709# create a JSON Web Signature (jws) with HS256 algorithm
5710# the resulting format is in JWS Compact Serialization
5711# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5712# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5713def jwt_encode_hs256(payload_data, key, headers={}):
5714 header_data = {
5715 'alg': 'HS256',
5716 'typ': 'JWT',
5717 }
5718 if headers:
5719 header_data.update(headers)
0f06bcd7 5720 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5721 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5722 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
5723 signature_b64 = base64.b64encode(h.digest())
5724 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5725 return token
819e0531 5726
5727
16b0d7e6 5728 # can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5729def jwt_decode_hs256(jwt):
5730 header_b64, payload_b64, signature_b64 = jwt.split('.')
2c98d998 5731 # add trailing ='s that may have been stripped, superfluous ='s are ignored
5732 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
16b0d7e6 5733 return payload_data
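# Usage sketch (illustrative payload/key): the token is three '.'-separated base64
# segments (header.payload.signature); decoding does NOT verify the signature.
#   >>> token = jwt_encode_hs256({'uid': 123}, 'hypothetical-secret')
#   >>> jwt_decode_hs256(token.decode())
#   {'uid': 123}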
5734
5735
53973b4d 5736WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5737
5738
7a32c70d 5739@functools.cache
819e0531 5740def supports_terminal_sequences(stream):
5741 if compat_os_name == 'nt':
8a82af35 5742 if not WINDOWS_VT_MODE:
819e0531 5743 return False
5744 elif not os.getenv('TERM'):
5745 return False
5746 try:
5747 return stream.isatty()
5748 except BaseException:
5749 return False
5750
5751
c53a18f0 5752def windows_enable_vt_mode():
5753 """Ref: https://bugs.python.org/issue30075 """
8a82af35 5754 if get_windows_version() < (10, 0, 10586):
53973b4d 5755 return
53973b4d 5756
c53a18f0 5757 import ctypes
5758 import ctypes.wintypes
5759 import msvcrt
5760
5761 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5762
5763 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5764 handle = os.open('CONOUT$', os.O_RDWR)
c53a18f0 5765 try:
5766 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5767 dw_original_mode = ctypes.wintypes.DWORD()
5768 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5769 if not success:
5770 raise Exception('GetConsoleMode failed')
5771
5772 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5773 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5774 if not success:
5775 raise Exception('SetConsoleMode failed')
c53a18f0 5776 finally:
5777 os.close(handle)
53973b4d 5778
f0795149 5779 global WINDOWS_VT_MODE
5780 WINDOWS_VT_MODE = True
5781 supports_terminal_sequences.cache_clear()
5782
53973b4d 5783
ec11a9f4 5784_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5785
5786
5787def remove_terminal_sequences(string):
5788 return _terminal_sequences_re.sub('', string)
5789
5790
5791def number_of_digits(number):
5792 return len('%d' % number)
34921b43 5793
5794
5795def join_nonempty(*values, delim='-', from_dict=None):
5796 if from_dict is not None:
7b2c3f47 5797 values = (traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 5798 return delim.join(map(str, filter(None, values)))
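# Usage sketch (illustrative): falsy values are dropped before joining.
#   >>> join_nonempty('mp4', None, 1080, delim='-')
#   'mp4-1080'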
06e57990 5799
5800
27231526
ZM
5801def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5802 """
5803 Find the largest format dimensions in terms of video width and, for each thumbnail:
5804 * Modify the URL: match the width with the provided regex and replace it with the largest format width
5805 * Update dimensions
5806
5807 This function is useful with video services that scale the provided thumbnails on demand
5808 """
5809 _keys = ('width', 'height')
5810 max_dimensions = max(
86e5f3ed 5811 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
5812 default=(0, 0))
5813 if not max_dimensions[0]:
5814 return thumbnails
5815 return [
5816 merge_dicts(
5817 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5818 dict(zip(_keys, max_dimensions)), thumbnail)
5819 for thumbnail in thumbnails
5820 ]
5821
5822
93c8410d
LNO
5823def parse_http_range(range):
5824 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5825 if not range:
5826 return None, None, None
5827 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5828 if not crg:
5829 return None, None, None
5830 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
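# Usage sketch (illustrative header values):
#   >>> parse_http_range('bytes 0-499/1234')
#   (0, 499, 1234)
#   >>> parse_http_range('bytes=500-')
#   (500, None, None)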
5831
5832
6b9e832d 5833def read_stdin(what):
5834 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5835 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5836 return sys.stdin
5837
5838
a904a7f8
L
5839def determine_file_encoding(data):
5840 """
88f60feb 5841 Detect the text encoding used
a904a7f8
L
5842 @returns (encoding, bytes to skip)
5843 """
5844
88f60feb 5845 # BOM marks are given priority over declarations
a904a7f8 5846 for bom, enc in BOMS:
a904a7f8
L
5847 if data.startswith(bom):
5848 return enc, len(bom)
5849
88f60feb 5850 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5851 # We ignore the endianness to get a good enough match
a904a7f8 5852 data = data.replace(b'\0', b'')
88f60feb 5853 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5854 return mobj.group(1).decode() if mobj else None, 0
a904a7f8
L
5855
5856
06e57990 5857class Config:
5858 own_args = None
9e491463 5859 parsed_args = None
06e57990 5860 filename = None
5861 __initialized = False
5862
5863 def __init__(self, parser, label=None):
9e491463 5864 self.parser, self.label = parser, label
06e57990 5865 self._loaded_paths, self.configs = set(), []
5866
5867 def init(self, args=None, filename=None):
5868 assert not self.__initialized
284a60c5 5869 self.own_args, self.filename = args, filename
5870 return self.load_configs()
5871
5872 def load_configs(self):
65662dff 5873 directory = ''
284a60c5 5874 if self.filename:
5875 location = os.path.realpath(self.filename)
65662dff 5876 directory = os.path.dirname(location)
06e57990 5877 if location in self._loaded_paths:
5878 return False
5879 self._loaded_paths.add(location)
5880
284a60c5 5881 self.__initialized = True
5882 opts, _ = self.parser.parse_known_args(self.own_args)
5883 self.parsed_args = self.own_args
9e491463 5884 for location in opts.config_locations or []:
6b9e832d 5885 if location == '-':
1060f82f 5886 if location in self._loaded_paths:
5887 continue
5888 self._loaded_paths.add(location)
6b9e832d 5889 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5890 continue
65662dff 5891 location = os.path.join(directory, expand_path(location))
06e57990 5892 if os.path.isdir(location):
5893 location = os.path.join(location, 'yt-dlp.conf')
5894 if not os.path.exists(location):
9e491463 5895 self.parser.error(f'config location {location} does not exist')
06e57990 5896 self.append_config(self.read_file(location), location)
5897 return True
5898
5899 def __str__(self):
5900 label = join_nonempty(
5901 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5902 delim=' ')
5903 return join_nonempty(
5904 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5905 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5906 delim='\n')
5907
7a32c70d 5908 @staticmethod
06e57990 5909 def read_file(filename, default=[]):
5910 try:
a904a7f8 5911 optionf = open(filename, 'rb')
86e5f3ed 5912 except OSError:
06e57990 5913 return default # silently skip if file is not present
a904a7f8
L
5914 try:
5915 enc, skip = determine_file_encoding(optionf.read(512))
5916 optionf.seek(skip, io.SEEK_SET)
5917 except OSError:
5918 enc = None # silently skip read errors
06e57990 5919 try:
5920 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 5921 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 5922 res = shlex.split(contents, comments=True)
44a6fcff 5923 except Exception as err:
5924 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 5925 finally:
5926 optionf.close()
5927 return res
5928
7a32c70d 5929 @staticmethod
06e57990 5930 def hide_login_info(opts):
86e5f3ed 5931 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5932 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5933
5934 def _scrub_eq(o):
5935 m = eqre.match(o)
5936 if m:
5937 return m.group('key') + '=PRIVATE'
5938 else:
5939 return o
5940
5941 opts = list(map(_scrub_eq, opts))
5942 for idx, opt in enumerate(opts):
5943 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5944 opts[idx + 1] = 'PRIVATE'
5945 return opts
5946
5947 def append_config(self, *args, label=None):
9e491463 5948 config = type(self)(self.parser, label)
06e57990 5949 config._loaded_paths = self._loaded_paths
5950 if config.init(*args):
5951 self.configs.append(config)
5952
7a32c70d 5953 @property
06e57990 5954 def all_args(self):
5955 for config in reversed(self.configs):
5956 yield from config.all_args
9e491463 5957 yield from self.parsed_args or []
5958
5959 def parse_known_args(self, **kwargs):
5960 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 5961
5962 def parse_args(self):
9e491463 5963 return self.parser.parse_args(self.all_args)
da42679b
LNO
5964
5965
d5d1df8a 5966class WebSocketsWrapper:
da42679b 5967 """Wraps the websockets module for use in non-async scopes"""
abfecb7b 5968 pool = None
da42679b 5969
3cea3edd 5970 def __init__(self, url, headers=None, connect=True):
059bc4db 5971 self.loop = asyncio.new_event_loop()
9cd08050 5972 # XXX: "loop" is deprecated
5973 self.conn = websockets.connect(
5974 url, extra_headers=headers, ping_interval=None,
5975 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd
LNO
5976 if connect:
5977 self.__enter__()
15dfb392 5978 atexit.register(self.__exit__, None, None, None)
da42679b
LNO
5979
5980 def __enter__(self):
3cea3edd 5981 if not self.pool:
9cd08050 5982 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b
LNO
5983 return self
5984
5985 def send(self, *args):
5986 self.run_with_loop(self.pool.send(*args), self.loop)
5987
5988 def recv(self, *args):
5989 return self.run_with_loop(self.pool.recv(*args), self.loop)
5990
5991 def __exit__(self, type, value, traceback):
5992 try:
5993 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5994 finally:
5995 self.loop.close()
15dfb392 5996 self._cancel_all_tasks(self.loop)
da42679b
LNO
5997
5998 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5999 # for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
7a32c70d 6000 @staticmethod
da42679b 6001 def run_with_loop(main, loop):
059bc4db 6002 if not asyncio.iscoroutine(main):
da42679b
LNO
6003 raise ValueError(f'a coroutine was expected, got {main!r}')
6004
6005 try:
6006 return loop.run_until_complete(main)
6007 finally:
6008 loop.run_until_complete(loop.shutdown_asyncgens())
6009 if hasattr(loop, 'shutdown_default_executor'):
6010 loop.run_until_complete(loop.shutdown_default_executor())
6011
7a32c70d 6012 @staticmethod
da42679b 6013 def _cancel_all_tasks(loop):
059bc4db 6014 to_cancel = asyncio.all_tasks(loop)
da42679b
LNO
6015
6016 if not to_cancel:
6017 return
6018
6019 for task in to_cancel:
6020 task.cancel()
6021
9cd08050 6022 # XXX: "loop" is removed in python 3.10+
da42679b 6023 loop.run_until_complete(
059bc4db 6024 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b
LNO
6025
6026 for task in to_cancel:
6027 if task.cancelled():
6028 continue
6029 if task.exception() is not None:
6030 loop.call_exception_handler({
6031 'message': 'unhandled exception during asyncio.run() shutdown',
6032 'exception': task.exception(),
6033 'task': task,
6034 })
6035
6036
8b7539d2 6037def merge_headers(*dicts):
08d30158 6038 """Merge dicts of HTTP headers case-insensitively, prioritizing the latter ones"""
76aa9913 6039 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
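# Usage sketch (illustrative headers): keys are title-cased, later dicts win.
#   >>> merge_headers({'user-agent': 'A', 'x-one': '1'}, {'User-Agent': 'B'})
#   {'User-Agent': 'B', 'X-One': '1'}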
28787f16 6040
6041
b1f94422 6042def cached_method(f):
6043 """Cache a method"""
6044 signature = inspect.signature(f)
6045
7a32c70d 6046 @functools.wraps(f)
b1f94422 6047 def wrapper(self, *args, **kwargs):
6048 bound_args = signature.bind(self, *args, **kwargs)
6049 bound_args.apply_defaults()
d5d1df8a 6050 key = tuple(bound_args.arguments.values())[1:]
b1f94422 6051
6368e2e6 6052 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
b1f94422 6053 if key not in cache:
6054 cache[key] = f(self, *args, **kwargs)
6055 return cache[key]
6056 return wrapper
6057
6058
28787f16 6059class classproperty:
83cc7b8a 6060 """property access for class methods with optional caching"""
6061 def __new__(cls, func=None, *args, **kwargs):
6062 if not func:
6063 return functools.partial(cls, *args, **kwargs)
6064 return super().__new__(cls)
c487cf00 6065
83cc7b8a 6066 def __init__(self, func, *, cache=False):
c487cf00 6067 functools.update_wrapper(self, func)
6068 self.func = func
83cc7b8a 6069 self._cache = {} if cache else None
28787f16 6070
6071 def __get__(self, _, cls):
83cc7b8a 6072 if self._cache is None:
6073 return self.func(cls)
6074 elif cls not in self._cache:
6075 self._cache[cls] = self.func(cls)
6076 return self._cache[cls]
19a03940 6077
6078
a5387729 6079class function_with_repr:
b2e0343b 6080 def __init__(self, func, repr_=None):
a5387729 6081 functools.update_wrapper(self, func)
b2e0343b 6082 self.func, self.__repr = func, repr_
a5387729 6083
6084 def __call__(self, *args, **kwargs):
6085 return self.func(*args, **kwargs)
6086
6087 def __repr__(self):
b2e0343b 6088 if self.__repr:
6089 return self.__repr
a5387729 6090 return f'{self.func.__module__}.{self.func.__qualname__}'
6091
6092
64fa820c 6093class Namespace(types.SimpleNamespace):
591bb9d3 6094 """Immutable namespace"""
591bb9d3 6095
7896214c 6096 def __iter__(self):
64fa820c 6097 return iter(self.__dict__.values())
7896214c 6098
7a32c70d 6099 @property
64fa820c 6100 def items_(self):
6101 return self.__dict__.items()
9b8ee23b 6102
6103
8dc59305 6104MEDIA_EXTENSIONS = Namespace(
6105 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
6106 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
6107 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
fbb73833 6108 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
8dc59305 6109 thumbnails=('jpg', 'png', 'webp'),
6110 storyboards=('mhtml', ),
6111 subtitles=('srt', 'vtt', 'ass', 'lrc'),
6112 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
6113)
6114MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
6115MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
6116
6117KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
6118
6119
be5c1ae8 6120class RetryManager:
6121 """Usage:
6122 for retry in RetryManager(...):
6123 try:
6124 ...
6125 except SomeException as err:
6126 retry.error = err
6127 continue
6128 """
6129 attempt, _error = 0, None
6130
6131 def __init__(self, _retries, _error_callback, **kwargs):
6132 self.retries = _retries or 0
6133 self.error_callback = functools.partial(_error_callback, **kwargs)
6134
6135 def _should_retry(self):
6136 return self._error is not NO_DEFAULT and self.attempt <= self.retries
6137
7a32c70d 6138 @property
be5c1ae8 6139 def error(self):
6140 if self._error is NO_DEFAULT:
6141 return None
6142 return self._error
6143
7a32c70d 6144 @error.setter
be5c1ae8 6145 def error(self, value):
6146 self._error = value
6147
6148 def __iter__(self):
6149 while self._should_retry():
6150 self.error = NO_DEFAULT
6151 self.attempt += 1
6152 yield self
6153 if self.error:
6154 self.error_callback(self.error, self.attempt, self.retries)
6155
7a32c70d 6156 @staticmethod
be5c1ae8 6157 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
6158 """Utility function for reporting retries"""
6159 if count > retries:
6160 if error:
6161 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
6162 raise e
6163
6164 if not count:
6165 return warn(e)
6166 elif isinstance(e, ExtractorError):
3ce29336 6167 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 6168 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
6169
6170 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
6171 if delay:
6172 info(f'Sleeping {delay:.2f} seconds ...')
6173 time.sleep(delay)
6174
6175
0647d925 6176def make_archive_id(ie, video_id):
6177 ie_key = ie if isinstance(ie, str) else ie.ie_key()
6178 return f'{ie_key.lower()} {video_id}'
6179
6180
a1c5bd82 6181def truncate_string(s, left, right=0):
6182 assert left > 3 and right >= 0
6183 if s is None or len(s) <= left + right:
6184 return s
71df9b7f 6185 return f'{s[:left-3]}...{s[-right:] if right else ""}'
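# Usage sketch (illustrative): keep `left` characters (minus the ellipsis) and
# optionally `right` characters from the end.
#   >>> truncate_string('abcdefghij', 5)
#   'ab...'
#   >>> truncate_string('abcdefghij', 5, 3)
#   'ab...hij'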
a1c5bd82 6186
6187
5314b521 6188def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
6189 assert 'all' in alias_dict, '"all" alias is required'
6190 requested = list(start or [])
6191 for val in options:
6192 discard = val.startswith('-')
6193 if discard:
6194 val = val[1:]
6195
6196 if val in alias_dict:
6197 val = alias_dict[val] if not discard else [
6198 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
6199 # NB: Do not allow regex in aliases for performance
6200 requested = orderedSet_from_options(val, alias_dict, start=requested)
6201 continue
6202
6203 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
6204 else [val] if val in alias_dict['all'] else None)
6205 if current is None:
6206 raise ValueError(val)
6207
6208 if discard:
6209 for item in current:
6210 while item in requested:
6211 requested.remove(item)
6212 else:
6213 requested.extend(current)
6214
6215 return orderedSet(requested)
6216
6217
d0d74b71 6218class FormatSorter:
6219 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
6220
6221 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6222 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6223 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
6224 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6225 'height', 'width', 'proto', 'vext', 'abr', 'aext',
6226 'fps', 'fs_approx', 'source', 'id')
6227
6228 settings = {
6229 'vcodec': {'type': 'ordered', 'regex': True,
6230 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6231 'acodec': {'type': 'ordered', 'regex': True,
71082216 6232 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
d0d74b71 6233 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6234 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6235 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6236 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6237 'vext': {'type': 'ordered', 'field': 'video_ext',
29ca4082 6238 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6239 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
fbb73833 6240 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
6241 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
6242 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
d0d74b71 6243 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
6244 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6245 'field': ('vcodec', 'acodec'),
6246 'function': lambda it: int(any(v != 'none' for v in it))},
6247 'ie_pref': {'priority': True, 'type': 'extractor'},
6248 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
6249 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
6250 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
6251 'quality': {'convert': 'float', 'default': -1},
6252 'filesize': {'convert': 'bytes'},
6253 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
6254 'id': {'convert': 'string', 'field': 'format_id'},
6255 'height': {'convert': 'float_none'},
6256 'width': {'convert': 'float_none'},
6257 'fps': {'convert': 'float_none'},
6258 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
6259 'tbr': {'convert': 'float_none'},
6260 'vbr': {'convert': 'float_none'},
6261 'abr': {'convert': 'float_none'},
6262 'asr': {'convert': 'float_none'},
6263 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
6264
6265 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
6266 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
6267 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
6268 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
6269 'res': {'type': 'multiple', 'field': ('height', 'width'),
6270 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
6271
6272 # Actual field names
6273 'format_id': {'type': 'alias', 'field': 'id'},
6274 'preference': {'type': 'alias', 'field': 'ie_pref'},
6275 'language_preference': {'type': 'alias', 'field': 'lang'},
6276 'source_preference': {'type': 'alias', 'field': 'source'},
6277 'protocol': {'type': 'alias', 'field': 'proto'},
6278 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
6279 'audio_channels': {'type': 'alias', 'field': 'channels'},
6280
6281 # Deprecated
6282 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
6283 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
6284 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
6285 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
6286 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
6287 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
6288 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
6289 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
6290 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
6291 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
6292 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
6293 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
6294 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
6295 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
6296 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6297 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6298 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6299 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6300 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6301 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6302 }
6303
6304 def __init__(self, ydl, field_preference):
6305 self.ydl = ydl
6306 self._order = []
6307 self.evaluate_params(self.ydl.params, field_preference)
6308 if ydl.params.get('verbose'):
6309 self.print_verbose_info(self.ydl.write_debug)
6310
6311 def _get_field_setting(self, field, key):
6312 if field not in self.settings:
6313 if key in ('forced', 'priority'):
6314 return False
6315 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
6316 'deprecated and may be removed in a future version')
6317 self.settings[field] = {}
6318 propObj = self.settings[field]
6319 if key not in propObj:
6320 type = propObj.get('type')
6321 if key == 'field':
6322 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
6323 elif key == 'convert':
6324 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
6325 else:
6326 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
6327 propObj[key] = default
6328 return propObj[key]
6329
6330 def _resolve_field_value(self, field, value, convertNone=False):
6331 if value is None:
6332 if not convertNone:
6333 return None
6334 else:
6335 value = value.lower()
6336 conversion = self._get_field_setting(field, 'convert')
6337 if conversion == 'ignore':
6338 return None
6339 if conversion == 'string':
6340 return value
6341 elif conversion == 'float_none':
6342 return float_or_none(value)
6343 elif conversion == 'bytes':
6344 return parse_bytes(value)
6345 elif conversion == 'order':
6346 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
6347 use_regex = self._get_field_setting(field, 'regex')
6348 list_length = len(order_list)
6349 empty_pos = order_list.index('') if '' in order_list else list_length + 1
6350 if use_regex and value is not None:
6351 for i, regex in enumerate(order_list):
6352 if regex and re.match(regex, value):
6353 return list_length - i
6354 return list_length - empty_pos # not in list
6355 else: # not regex or value = None
6356 return list_length - (order_list.index(value) if value in order_list else empty_pos)
6357 else:
6358 if value.isnumeric():
6359 return float(value)
6360 else:
6361 self.settings[field]['convert'] = 'string'
6362 return value
6363
6364 def evaluate_params(self, params, sort_extractor):
6365 self._use_free_order = params.get('prefer_free_formats', False)
6366 self._sort_user = params.get('format_sort', [])
6367 self._sort_extractor = sort_extractor
6368
6369 def add_item(field, reverse, closest, limit_text):
6370 field = field.lower()
6371 if field in self._order:
6372 return
6373 self._order.append(field)
6374 limit = self._resolve_field_value(field, limit_text)
6375 data = {
6376 'reverse': reverse,
6377 'closest': False if limit is None else closest,
6378 'limit_text': limit_text,
6379 'limit': limit}
6380 if field in self.settings:
6381 self.settings[field].update(data)
6382 else:
6383 self.settings[field] = data
6384
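        # Precedence: the 'forced' default fields always come first; the 'priority'
        # defaults are prepended as well unless format_sort_force is set (which lets
        # the user/extractor order override them); then user fields, extractor fields
        # and finally the remaining defaults.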
6385 sort_list = (
6386 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
6387 + (tuple() if params.get('format_sort_force', False)
6388 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
6389 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
6390
6391 for item in sort_list:
6392 match = re.match(self.regex, item)
6393 if match is None:
6394 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
6395 field = match.group('field')
6396 if field is None:
6397 continue
6398 if self._get_field_setting(field, 'type') == 'alias':
6399 alias, field = field, self._get_field_setting(field, 'field')
6400 if self._get_field_setting(alias, 'deprecated'):
6401 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
6402 f'be removed in a future version. Please use {field} instead')
6403 reverse = match.group('reverse') is not None
6404 closest = match.group('separator') == '~'
6405 limit_text = match.group('limit')
6406
6407 has_limit = limit_text is not None
6408 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
6409 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
6410
6411 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6412 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6413 limit_count = len(limits)
6414 for (i, f) in enumerate(fields):
6415 add_item(f, reverse, closest,
6416 limits[i] if i < limit_count
6417 else limits[0] if has_limit and not has_multiple_limits
6418 else None)
6419
6420 def print_verbose_info(self, write_debug):
6421 if self._sort_user:
6422 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6423 if self._sort_extractor:
6424 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6425 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6426 '+' if self._get_field_setting(field, 'reverse') else '', field,
6427 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6428 self._get_field_setting(field, 'limit_text'),
6429 self._get_field_setting(field, 'limit'))
6430 if self._get_field_setting(field, 'limit_text') is not None else '')
6431 for field in self._order if self._get_field_setting(field, 'visible')]))
6432
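    # Maps a single field value to a comparable tuple: None sorts lowest, strings sort
    # above numbers, and numeric values are adjusted for 'reverse', 'closest' and any
    # 'limit' so that a larger tuple always means "more preferred".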
6433 def _calculate_field_preference_from_value(self, format, field, type, value):
6434 reverse = self._get_field_setting(field, 'reverse')
6435 closest = self._get_field_setting(field, 'closest')
6436 limit = self._get_field_setting(field, 'limit')
6437
6438 if type == 'extractor':
6439 maximum = self._get_field_setting(field, 'max')
6440 if value is None or (maximum is not None and value >= maximum):
6441 value = -1
6442 elif type == 'boolean':
6443 in_list = self._get_field_setting(field, 'in_list')
6444 not_in_list = self._get_field_setting(field, 'not_in_list')
6445 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6446 elif type == 'ordered':
6447 value = self._resolve_field_value(field, value, True)
6448
6449 # try to convert to number
6450 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6451 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6452 if is_num:
6453 value = val_num
6454
6455 return ((-10, 0) if value is None
6456 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6457 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6458 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6459 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6460 else (-1, value, 0))
6461
6462 def _calculate_field_preference(self, format, field):
6463 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6464 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6465 if type == 'multiple':
6466 type = 'field' # Only 'field' is allowed in multiple for now
6467 actual_fields = self._get_field_setting(field, 'field')
6468
6469 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6470 else:
6471 value = get_value(field)
6472 return self._calculate_field_preference_from_value(format, field, type, value)
6473
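    # Public entry point: fills in fields that extractors commonly omit (protocol, ext,
    # video/audio ext, the FLV+HEVC penalty, missing bitrates) and returns the tuple of
    # per-field preferences used as the sort key for this format.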
6474 def calculate_preference(self, format):
6475 # Determine missing protocol
6476 if not format.get('protocol'):
6477 format['protocol'] = determine_protocol(format)
6478
6479 # Determine missing ext
6480 if not format.get('ext') and 'url' in format:
6481 format['ext'] = determine_ext(format['url'])
6482 if format.get('vcodec') == 'none':
6483 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6484 format['video_ext'] = 'none'
6485 else:
6486 format['video_ext'] = format['ext']
6487 format['audio_ext'] = 'none'
6488 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6489 # format['preference'] = -1000
6490
5424dbaf 6491 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
6492 # HEVC-over-FLV is out of spec (not defined by the original FLV specification)
6493 # ref. https://trac.ffmpeg.org/ticket/6389
6494 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
6495 format['preference'] = -100
6496
d0d74b71 6497 # Determine missing bitrates
6498 if format.get('tbr') is None:
6499 if format.get('vbr') is not None and format.get('abr') is not None:
6500 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6501 else:
6502 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6503 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6504 if format.get('acodec') != 'none' and format.get('abr') is None:
6505 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6506
6507 return tuple(self._calculate_field_preference(format, field) for field in self._order)
6508
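# Illustrative use of the format sorter above (a sketch only, not how YoutubeDL wires it
# up internally; assumes the class is exposed as FormatSorter and that `ydl` is a
# YoutubeDL-like object providing .params, .write_debug and .deprecated_feature):
#
#   sorter = FormatSorter(ydl, field_preference=['res', 'fps', 'tbr'])
#   formats.sort(key=sorter.calculate_preference)  # higher tuples sort later, i.e. best format last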
6509
9b8ee23b 6510# Deprecated
6511has_certifi = bool(certifi)
6512has_websockets = bool(websockets)
8e40b9d1 6513
6514
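# Thin wrapper: delegates to yt_dlp.plugins.load_plugins and additionally injects the
# loaded plugin classes into the given namespace.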
6515def load_plugins(name, suffix, namespace):
6516 from .plugins import load_plugins
6517 ret = load_plugins(name, suffix)
6518 namespace.update(ret)
6519 return ret