]> jfr.im git - yt-dlp.git/blob - yt_dlp/utils/_utils.py
[networking] Add module (#2861)
[yt-dlp.git] / yt_dlp / utils / _utils.py
1 import asyncio
2 import atexit
3 import base64
4 import binascii
5 import calendar
6 import codecs
7 import collections
8 import collections.abc
9 import contextlib
10 import datetime
11 import email.header
12 import email.utils
13 import errno
14 import hashlib
15 import hmac
16 import html.entities
17 import html.parser
18 import http.client
19 import http.cookiejar
20 import inspect
21 import io
22 import itertools
23 import json
24 import locale
25 import math
26 import mimetypes
27 import netrc
28 import operator
29 import os
30 import platform
31 import random
32 import re
33 import shlex
34 import socket
35 import ssl
36 import struct
37 import subprocess
38 import sys
39 import tempfile
40 import time
41 import traceback
42 import types
43 import unicodedata
44 import urllib.error
45 import urllib.parse
46 import urllib.request
47 import xml.etree.ElementTree
48
49 from . import traversal
50
51 from ..compat import functools # isort: split
52 from ..compat import (
53 compat_etree_fromstring,
54 compat_expanduser,
55 compat_HTMLParseError,
56 compat_os_name,
57 compat_shlex_quote,
58 )
59 from ..dependencies import websockets, xattr
60
# Make this private submodule report itself as its parent package
# (`yt_dlp.utils`) in tracebacks and introspection
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


# Canned User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
70
71
class NO_DEFAULT:
    """Sentinel used as a default argument to distinguish 'not given' from None."""
    pass
74
75
def IDENTITY(x):
    """Identity function; used as a default transform/callback."""
    return x
78
79
# Month names used when parsing free-form dates, by language code
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
95
96 # From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Timezone abbreviation -> UTC offset in hours
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
105
106 # needed for sanitizing filenames in restricted mode
# Accented character -> ASCII replacement(s);
# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
110
# strptime formats tried when parsing free-form date strings
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Variants preferring day-first (European style) interpretation of ambiguous dates
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# Variants preferring month-first (US style) interpretation of ambiguous dates
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of a P.A.C.K.E.R. obfuscated-JS payload
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> block and captures the JSON
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# A non-negative decimal number, optionally with a fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
181
182
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
197
198
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # Write to a sibling temp file first, then rename over the target
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile is created 0600; widen to the umask-derived
            # default permissions a normal open() would have produced
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file; re-raise the original error
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
223
224
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    # Only plain attribute names are supported; anything else would break the
    # predicate syntax below
    assert re.match(r'^[a-zA-Z_-]+$', key)
    predicate = f'[@{key}]' if val is None else f"[@{key}='{val}']"
    return node.find(xpath + predicate)
230
231 # On python2.6 the xml.etree.ElementTree.Element methods don't support
232 # the namespace parameter
233
234
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' components of *path* into '{uri}tag' form using *ns_map*."""
    def _expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            return parts[0]
        ns, tag = parts
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join(_expand(component) for component in path.split('/'))
245
246
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching *xpath* (a string, or an iterable of
    candidate xpaths tried in order) under *node*.

    @param name     Human-readable name used in the error message
    @param fatal    Raise ExtractorError instead of returning None when missing
    @param default  Value returned when nothing matches
    """
    def _find_xpath(xpath):
        return node.find(xpath)

    # Initialize so an empty iterable of xpaths falls through to the
    # not-found handling instead of raising NameError
    n = None
    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
268
269
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but return the matched element's text content.

    @param name     Human-readable name used in the error message
    @param fatal    Raise ExtractorError instead of returning None when missing
    @param default  Value returned when the element or its text is missing
    """
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text
283
284
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute *key* of the first element matching xpath[@key].

    @param name     Human-readable name used in the error message
    @param fatal    Raise ExtractorError instead of returning None when missing
    @param default  Value returned when no matching element exists
    """
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
296
297
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)
301
302
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
306
307
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    return matches[0] if matches else None
312
313
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_html_by_class(class_name, html)
    return matches[0] if matches else None
318
319
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute in the passed HTML document"""
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None
323
324
def get_element_html_by_attribute(attribute, value, html, **kwargs):
    """Return the html of the first tag with the specified attribute in the passed HTML document"""
    # Renamed **kargs -> **kwargs for consistency with the sibling helpers;
    # the catch-all name is not part of the callable interface
    retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None
328
329
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # NOTE(review): **kargs is accepted but not forwarded — confirm whether
    # this is intentional signature compatibility or an oversight
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
335
336
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # The class attribute may contain several space-separated names; match
    # class_name as a whole word within the attribute value
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
342
343
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [text for text, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
347
348
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [outer_html for _, outer_html in get_elements_text_and_html_by_attribute(*args, **kwargs)]
352
353
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    # If the value contains characters that must be quoted in HTML attributes,
    # require quotes around it; otherwise quotes are optional
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Match the opening tag up to and including attribute="value"
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        # Strip a matching pair of surrounding quotes from the content, then
        # decode HTML entities
        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
379
380
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Control-flow exception: signals that the outermost tag was closed
        pass

    def __init__(self):
        # Stack of currently-open tag names
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until the matching opening tag is found (tolerates unclosed
        # inner tags)
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
421
422
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index wrapped to raise a parse error instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed only the opening tag first to verify it parsed as expected
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed up to each candidate closing tag until the parser signals that
        # the outermost tag was actually closed (skips nested same-name tags)
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
457
458
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the first start tag encountered
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        # Abort parsing after the first tag; the caller suppresses this error
        raise compat_HTMLParseError('done')
469
470
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser collecting the attributes of each top-level <li> element."""

    def __init__(self):
        super().__init__()
        self.items = []   # one attribute dict per top-level <li>
        self._level = 0   # current tag nesting depth

    def handle_starttag(self, tag, attrs):
        # Record only <li> tags at the outermost nesting level
        if self._level == 0 and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
486
487
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    # The parser deliberately aborts with compat_HTMLParseError after the
    # first start tag; swallow that control-flow exception
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
507
508
def parse_list(webpage):
    """Parse a string containing a series of HTML <li> elements and
    return a list with the attribute dict of each element."""
    collector = HTMLListAttrsParser()
    collector.feed(webpage)
    collector.close()
    return collector.items
516
517
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Collapse whitespace, then turn <br> and paragraph boundaries into newlines
    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
532
533
class LenientJSONDecoder(json.JSONDecoder):
    """JSON decoder tolerating common real-world deviations.

    @param transform_source  Callable applied to the input string before decoding
    @param ignore_extra      Ignore trailing data after the first JSON value
    @param close_objects     Max number of unterminated objects/arrays to auto-close
    """
    # TODO: Write tests

    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # Each unterminated object may need up to two repairs (comma handling
        # plus inserting the closing bracket)
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Attempt to repair the truncated document described by *err*.

        Returns the repaired document, or None when no repair is applicable.
        """
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        """Decode *s*, retrying with auto-closed objects when configured."""
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    # Keep `s` intact unless a repair was produced; previously
                    # `s` was clobbered with None and the re-raise below
                    # crashed with TypeError instead of a JSONDecodeError
                    fixed = self._close_object(e)
                    if fixed is not None:
                        s = fixed
                        continue
                # Clamp the context start so small e.pos does not wrap around
                # to the end of the string
                raise type(e)(f'{e.msg} in {s[max(e.pos - 10, 0):e.pos + 10]!r}', s, e.pos)
        # Unlike a bare `assert`, this is not stripped under `python -O`
        raise AssertionError('Too many attempts to decode JSON')
572
573
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # First attempt uses the filename as-is; second attempt (if reached) uses
    # the sanitized filename
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Fall back to an unlocked open if locking is unavailable
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Permission errors will not be fixed by renaming; give up
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
611
612
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when *timestr* cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    return None if parsed is None else email.utils.mktime_tz(parsed)
620
621
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            # '\0' marks substitute characters for later cleanup passes
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        # Collapse runs of underscores and trim cosmetic leftovers
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
675
676
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows.

    @param force  Apply Windows-style sanitization even on other platforms
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        # Nothing to do on non-Windows platforms unless forced
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters invalid in Windows path components (keep '.'/'..')
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Preserve a leading separator on forced sanitization
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
698
699
def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return None
    if url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    for typo, fixup in (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    ):
        if re.match(typo, url):
            return re.sub(typo, fixup, url)
    return url
718
719
def extract_basic_auth(url):
    """Strip userinfo from *url*.

    Returns (url_without_credentials, basic_auth_header_or_None).
    """
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild the netloc without the user:password@ prefix
    netloc = parts.hostname if parts.port is None else f'{parts.hostname}:{parts.port}'
    clean_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = f'{parts.username}:{parts.password or ""}'.encode()
    return clean_url, f'Basic {base64.b64encode(credentials).decode()}'
730
731
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request with the URL sanitized/escaped and any inline
    user:password credentials moved into an Authorization header."""
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # Inject into the positional headers dict if given, else into kwargs
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
738
739
def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))
743
744
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable, preserving order.

    @param lazy  Return a generator instead of a materialized list
    """
    def _dedup():
        seen = []  # a list, not a set: the items may be unhashable
        for item in iterable:
            if item in seen:
                continue
            seen.append(item)
            yield item

    gen = _dedup()
    return gen if lazy else list(gen)
755
756
757 def _htmlentity_transform(entity_with_semicolon):
758 """Transforms an HTML entity to a character."""
759 entity = entity_with_semicolon[:-1]
760
761 # Known non-numeric HTML entity
762 if entity in html.entities.name2codepoint:
763 return chr(html.entities.name2codepoint[entity])
764
765 # TODO: HTML5 allows entities without a semicolon.
766 # E.g. '&Eacuteric' should be decoded as 'Éric'.
767 if entity_with_semicolon in html.entities.html5:
768 return html.entities.html5[entity_with_semicolon]
769
770 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
771 if mobj is not None:
772 numstr = mobj.group(1)
773 if numstr.startswith('x'):
774 base = 16
775 numstr = '0%s' % numstr
776 else:
777 base = 10
778 # See https://github.com/ytdl-org/youtube-dl/issues/7518
779 with contextlib.suppress(ValueError):
780 return chr(int(numstr, base))
781
782 # Unknown entity in name, return its literal representation
783 return '&%s;' % entity
784
785
def unescapeHTML(s):
    """Decode HTML entities in *s*; None passes through unchanged."""
    if s is None:
        return None
    assert isinstance(s, str)

    def _replace(mobj):
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^&;]+;)', _replace, s)
793
794
def escapeHTML(text):
    """Escape &, <, >, and both quote characters for safe HTML embedding."""
    # Single-pass per-character translation; equivalent to chained replaces
    # with '&' handled first
    return text.translate(str.maketrans({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    }))
804
805
class netrc_from_content(netrc.netrc):
    """netrc parser fed from an in-memory string instead of a file."""

    def __init__(self, content):
        # Skip netrc.netrc.__init__ (which reads a file); initialize the
        # containers it would have set up, then parse the given string
        self.hosts = {}
        self.macros = {}
        with io.StringIO(content) as stream:
            self._parse('-', stream, False)
811
812
class Popen(subprocess.Popen):
    """subprocess.Popen wrapper: hides console windows on Windows, restores
    library paths under PyInstaller, and adds convenience helpers."""

    if sys.platform == 'win32':
        # Prevent a console window from flashing up for spawned processes
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
             https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether streams are text-mode so run() can pick defaults
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        """communicate(), but kill the process if communication fails/aborts."""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process; optionally wait up to *timeout* for it to die."""
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run to completion and return (stdout, stderr, returncode)."""
        with cls(*args, **kwargs) as proc:
            # Match the stream type: '' for text mode, b'' for binary
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
869
870
def encodeArgument(s):
    """Return *s* as str, decoding legacy byte strings as ASCII."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
876
877
# (hours, minutes, seconds, milliseconds) of a duration
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration in milliseconds into an (hours, minutes, seconds,
    milliseconds) named tuple."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
886
887
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H<delim>]M<delim>S, optionally with
    a '.mmm' milliseconds suffix."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        formatted = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        formatted = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        formatted = '%d' % t.seconds
    if not msec:
        return formatted
    return '%s.%03d' % (formatted, t.milliseconds)
897
898
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose SSL context honors the relevant
    entries of the *params* options dict."""
    from ..networking._helper import make_ssl_context  # delayed import to avoid circularity
    return YoutubeDLHTTPSHandler(params, context=make_ssl_context(
        verify=not params.get('nocheckcertificate'),
        client_certificate=params.get('client_certificate'),
        client_certificate_key=params.get('client_certificate_key'),
        client_certificate_password=params.get('client_certificate_password'),
        legacy_support=params.get('legacyserverconnect'),
        use_certifi='no-certifi' not in params.get('compat_opts', []),
    ), **kwargs)
909
910
def bug_reports_message(before=';'):
    """Return a standard "please report this issue" blurb.

    @param before  Text this message will follow; controls capitalization and spacing
    """
    from ..update import REPOSITORY  # delayed import to avoid circularity

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    # Capitalize when starting a new sentence
    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg
922
923
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Subclasses may set a class-level default message
    msg = None

    def __init__(self, msg=None):
        if msg is None:
            # Fall back to the class default, then to the class name
            msg = self.msg if self.msg is not None else type(self).__name__
        self.msg = msg
        super().__init__(msg)
934
935
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions  # delayed import to avoid circularity
        # Network failures are always "expected" (not yt-dlp bugs)
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message assembled from extractor name, video id, original
        # message, cause, and (for unexpected errors) the bug-report blurb
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted traceback(s) of this error and its cause, or None."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        # Keep msg/args in sync when any contributing attribute changes
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
978
979
class UnsupportedError(ExtractorError):
    """Raised when no extractor handles the given URL."""

    def __init__(self, url):
        super().__init__(f'Unsupported URL: {url}', expected=True)
        self.url = url
985
986
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
990
991
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Geo restrictions are never yt-dlp bugs
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        # List of country codes the video is restricted to, if known
        self.countries = countries
1003
1004
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        # Not being live is an expected condition, not a bug
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)
1011
1012
class DownloadError(YoutubeDLError):
    """Raised by FileDownloader objects that are not configured to continue
    on errors; carries the human-readable message and, optionally, the
    original exception info.
    """

    def __init__(self, msg, exc_info=None):
        """@param exc_info  The original cause, as returned by sys.exc_info(), if available"""
        super().__init__(msg)
        self.exc_info = exc_info
1025
1026
class EntryNotInPlaylist(YoutubeDLError):
    """Raised by YoutubeDL when a requested entry cannot be found in the
    playlist info_dict.
    """
    msg = 'Entry not found in info'
1034
1035
class SameFileError(YoutubeDLError):
    """Same File exception.

    Raised by FileDownloader objects when multiple files would have to be
    downloaded to the same path on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """@param filename  The conflicting output path, appended to the message if given"""
        if filename is not None:
            # BUG FIX: the literal text "(unknown)" was appended instead of
            # the actual conflicting filename, leaving `filename` unused
            self.msg += f': {filename}'
        super().__init__(self.msg)
1048
1049
class PostProcessingError(YoutubeDLError):
    """Raised by a PostProcessor's run() method to signal that the
    postprocessing task failed.
    """
1056
1057
class DownloadCancelled(YoutubeDLError):
    """Base class for errors that should interrupt the download queue"""
    msg = 'The download was cancelled'
1061
1062
class ExistingVideoReached(DownloadCancelled):
    """Raised when --break-on-existing hits a video already in the archive"""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1066
1067
class RejectedVideoReached(DownloadCancelled):
    """Raised when --break-match-filter hits a video that fails the filter"""
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1071
1072
class MaxDownloadsReached(DownloadCancelled):
    """Raised when the --max-downloads limit has been reached"""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1076
1077
class ReExtractInfo(YoutubeDLError):
    """Raised when the video info needs to be extracted again"""

    def __init__(self, msg, expected=False):
        """@param expected  Whether the failure is anticipated (suppresses bug-report text downstream)"""
        super().__init__(msg)
        self.expected = expected
1084
1085
class ThrottledDownload(ReExtractInfo):
    """Raised when the measured speed falls below --throttled-rate"""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)
1092
1093
class UnavailableVideoError(YoutubeDLError):
    """Raised when a video is requested in a format that is not available
    for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """@param err  Optional detail appended to the base message"""
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1106
1107
class ContentTooShortError(YoutubeDLError):
    """Raised when a downloaded file is smaller than the size announced by
    the server, which usually indicates an interrupted connection.
    """

    def __init__(self, downloaded, expected):
        """Both sizes are in bytes."""
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        self.downloaded = downloaded
        self.expected = expected
1121
1122
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing xattr metadata fails; classifies the failure into `reason`."""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code, self.msg = code, msg

        # Classify the failure so callers can react without string-parsing.
        # Message substrings are checked as a fallback when no errno is given.
        no_space = self.code in (errno.ENOSPC, errno.EDQUOT)
        if no_space or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg:
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1137
1138
class XAttrUnavailableError(YoutubeDLError):
    """Raised when no working xattr implementation is available"""
1141
1142
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler supporting a per-request SOCKS proxy and a custom connection class."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params  # options dict; stored but not read in this class

    def https_open(self, req):
        """Open an HTTPS request, honoring an optional 'Ytdl-socks-proxy' pseudo-header."""
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # A SOCKS proxy is smuggled in via a pseudo-header; wrap the
        # connection class accordingly and strip the header before sending
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            from ..networking._urllib import make_socks_conn_class
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        from ..networking._urllib import _create_http_connection
        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                # Map this specific TLS handshake failure to an actionable hint
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
1173
1174
def is_path_like(f):
    """Return True if `f` can be used as a filesystem path (str, bytes or os.PathLike)."""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1177
1178
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """Cookie processor that applies the same cookie handling to HTTP and HTTPS traffic."""

    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the HTTP hooks for HTTPS requests/responses
    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1188
1189
def extract_timezone(date_str):
    """Split a date string into (utc-offset timedelta, date_str with the timezone removed)."""
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
            [ ]?                                                 # optional space
            (?P<sign>\+|-)                                       # +/-
            (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})           # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # No numeric offset found: look for a trailing timezone abbreviation
        # after a time-of-day (e.g. '... 12:30 EST')
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            # Known abbreviation: strip it and convert its hour offset
            date_str = date_str[:-len(m.group('tz'))]
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # A plain 'Z' means UTC (zero offset)
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1218
1219
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Drop fractional seconds; strptime's %S does not accept them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
        return calendar.timegm(dt.timetuple())
1235
1236
def date_formats(day_first=True):
    """Return the known date format strings, preferring day-first or month-first order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1239
1240
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    # Commas, AM/PM markers and timezones carry no date information
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    upload_date = None
    # Try every known format; later formats deliberately overwrite earlier hits
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822-style parsing
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)
1263
1264
def unified_timestamp(date_str, day_first=True):
    """Parse a date/time string in many formats into a UNIX timestamp, or None."""
    if not isinstance(date_str, str):
        return None

    # Drop commas/pipes and day-of-week names, then collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # 12-hour clock: the offset is applied after parsing below
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to RFC 2822-style parsing
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1296
1297
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to `default_ext`."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1309
1310
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle filename by inserting '<lang>.<format>' as the extension."""
    sub_ext = f'{sub_lang}.{sub_format}'
    return replace_extension(filename, sub_ext, expected_real_ext)
1313
1314
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        # Defer rounding: keep full precision here, round by the parsed unit below
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # Recursively resolve the base date, then apply the signed offset
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # Months/years have variable length, so use calendar arithmetic
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # 'auto': round to the unit the user actually wrote
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1355
1356
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict Restrict allowed patterns to "YYYYMMDD" and
                  (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1367
1368
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by a number of months (may be negative)."""
    total_months = dt.month - 1 + months
    year = dt.year + total_months // 12
    month = total_months % 12 + 1
    # Clamp to the last valid day of the target month (e.g. Jan 31 + 1 month -> Feb 28/29)
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
1376
1377
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {'day': 86400, 'hour': 3600, 'minute': 60, 'second': 1}
    step = unit_seconds[precision]
    timestamp = calendar.timegm(dt.timetuple())
    # Round half-up to the nearest multiple of `step`
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
1394
1395
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
1404
1405
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest possible range
        self.start = date_from_str(start, strict=True) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end, strict=True) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1439
1440
@functools.cache
def system_identifier():
    """Return a one-line description of the Python/OS environment (used in bug reports)."""
    python_implementation = platform.python_implementation()
    if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        python_implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1459
1460
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1468
1469
def write_string(s, out=None, encoding=None):
    """Write `s` to `out` (default: stderr), handling binary streams and encoding fallbacks."""
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # NOTE(review): prefixing line breaks with a space appears to be a
        # Windows terminal rendering workaround -- confirm before relying on it
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: we must encode ourselves
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: write bytes to it
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1489
1490
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Emit a deprecation message: printed once per message in CLI mode,
    raised as a DeprecationWarning when used as a library."""
    from .. import _IN_CLI
    if _IN_CLI:
        # Print each distinct message only once per process
        if msg in deprecation_warning._cache:
            return
        deprecation_warning._cache.add(msg)
        if printer:
            return printer(f'{msg}{bug_reports_message()}', **kwargs)
        return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
    else:
        import warnings
        # +3 skips this frame, the caller's wrapper and warnings' own machinery
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)


# Messages already shown in this process (CLI mode only)
deprecation_warning._cache = set()
1506
1507
def bytes_to_intlist(bs):
    """Convert a bytes-like (or character) sequence into a list of integer values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: bytes/bytearray index to ints directly
        return list(bs)
    # Character sequences (str): map each character to its code point
    return [ord(ch) for ch in bs]
1515
1516
def intlist_to_bytes(xs):
    """Pack a sequence of integers (0-255) back into a bytes object."""
    if not xs:
        return b''
    return struct.pack(f'{len(xs)}B', *xs)
1521
1522
class LockingUnsupportedError(OSError):
    """Raised on platforms where no file locking mechanism is available"""
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1528
1529
# Cross-platform file locking: defines _lock_file(f, exclusive, block) and
# _unlock_file(f) using Win32 LockFileEx/UnlockFileEx on Windows and
# fcntl on POSIX, with a LockingUnsupportedError fallback.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED struct passed to (Un)LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering the whole file (low/high halves of the length)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the matching unlock
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 selects an exclusive lock, 0x1 makes the call fail
        # immediately instead of blocking
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            # Try every unlock mechanism in turn; first success wins
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
1616
1617
class locked_file:
    """File wrapper that holds an advisory lock for its open lifetime.

    Usable as a context manager; other attribute access is delegated to
    the underlying file object.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        """
        @param mode      One of 'r', 'rb', 'a', 'ab', 'w', 'wb'
        @param block     Whether to wait for the lock (False raises instead)
        """
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        # Readers take a shared lock; all other modes an exclusive one
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only after the lock is held (see O_CREAT note above)
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        """Release the lock (if held) without closing the file."""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the object can be used without a `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate read/write/seek/... to the wrapped file object
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1681
1682
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' if it is None."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1687
1688
def shell_quote(args):
    """Quote each argument for safe use on a shell command line and join with spaces."""
    encoding = get_filesystem_encoding()

    def _as_str(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(_as_str(arg)) for arg in args)
1698
1699
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge with any data already smuggled into the URL
    url, existing_data = unsmuggle_url(url, {})
    data.update(existing_data)
    payload = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{payload}'
1708
1709
def unsmuggle_url(smug_url, default=None):
    """Extract data hidden by smuggle_url(); returns (clean_url, data_or_default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, payload = smug_url.rpartition('#')
    smuggled_json = urllib.parse.parse_qs(payload)['__youtubedl_smuggle'][0]
    return url, json.loads(smuggled_json)
1717
1718
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal sufixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        exponent = min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        # Binary prefixes: k -> Ki, M -> Mi, ... ('' stays '')
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    return fmt % (num / factor ** exponent, suffix)
1731
1732
def format_bytes(bytes):
    """Format a byte count like '5.50MiB' using binary prefixes; 'N/A' when not parseable."""
    # `bytes` shadows the builtin, but the name is kept since callers may pass it by keyword
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
1735
1736
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number> <unit>' using `unit_table` multipliers; return rounded int or None."""
    # Non-strict mode also accepts ',' as a decimal separator
    num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(map(re.escape, unit_table))
    matcher = re.fullmatch if strict else re.match
    m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None

    num = float(m.group('num').replace(',', '.'))
    return round(num * unit_table[m.group('unit')])
1748
1749
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    table = {unit: 1024 ** exponent for exponent, unit in enumerate(['', *'KMGTPEZY'])}
    return lookup_unit_table(table, s.upper(), strict=True)
1755
1756
def parse_filesize(s):
    """Parse a human-readable file size like '5.6 MiB' or '1,5GB' into bytes, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too. The table is generated instead of written
    # out: for every magnitude there are seven spellings -- XiB (binary),
    # XB (decimal), xB (binary), Xb/xb (decimal), plus the two full names.
    _UNIT_TABLE = {'B': 1, 'b': 1, 'bytes': 1}
    _DECIMAL_NAMES = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa', 'zetta', 'yotta')
    _BINARY_NAMES = ('kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', 'zebi', 'yobi')
    for exponent, (dec_name, bin_name) in enumerate(zip(_DECIMAL_NAMES, _BINARY_NAMES), start=1):
        lower = dec_name[0]
        upper = lower.upper()
        decimal, binary = 1000 ** exponent, 1024 ** exponent
        _UNIT_TABLE.update({
            f'{upper}iB': binary,
            f'{upper}B': decimal,
            f'{lower}B': binary,
            f'{upper}b': decimal,
            f'{lower}b': decimal,
            f'{dec_name}bytes': decimal,
            f'{bin_name}bytes': binary,
        })

    return lookup_unit_table(_UNIT_TABLE, s)
1826
1827
def parse_count(s):
    """Parse a human-readable count like '1.2M views' into an int, or None."""
    if s is None:
        return None

    # Drop a leading non-numeric word (e.g. 'Views 1,234' -> '1,234')
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    ret = lookup_unit_table(_UNIT_TABLE, s)
    if ret is not None:
        return ret

    # Last resort: take the leading number and ignore any trailing text
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(mobj.group(1)) if mobj else None
1855
1856
def parse_resolution(s, *, lenient=False):
    """Extract width/height from strings like '1920x1080', '720p' or '4k'; {} when absent."""
    if s is None:
        return {}

    # `lenient` drops the requirement that the numbers are not embedded in words
    pattern = (r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
               else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    mobj = re.search(pattern, s)
    if mobj:
        return {'width': int(mobj.group('w')), 'height': int(mobj.group('h'))}

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(mobj.group(1)) * 540}

    return {}
1880
1881
def parse_bitrate(s):
    """Parse a bitrate like '128 kbps' into an int (in kbps), or None."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
1888
1889
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    # Unknown languages fall back to the English month names
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name in month_names:
        return month_names.index(name) + 1
    return None
1899
1900
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """

    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1909
1910
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Negative lookahead leaves existing entities and character references untouched
    unescaped_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(unescaped_amp, '&amp;', xml_str)
1917
1918
def setproctitle(title):
    """Set the process title (as shown by e.g. ps) via libc prctl; silently no-op on failure."""
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        # Not a glibc system (or library missing) -- nothing to do
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME on Linux
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1944
1945
def remove_start(s, start):
    """Strip `start` from the beginning of `s` if present; tolerates s=None."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1948
1949
def remove_end(s, end):
    """Strip `end` from the end of `s` if present; tolerates s=None.

    BUG FIX: an empty `end` previously truncated the whole string, because
    ``s.endswith('')`` is True and ``s[:-0]`` is the empty string. Removing
    an empty suffix now leaves `s` unchanged.
    """
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
1952
1953
def remove_quotes(s):
    """Strip one level of matching single or double quotes surrounding `s`."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1961
1962
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1969
1970
def url_basename(url):
    """Return the last path component of a URL, e.g. 'v.mp4' for 'http://x/a/v.mp4?q=1'."""
    path = urllib.parse.urlparse(url).path
    return path.rstrip('/').rpartition('/')[2]
1974
1975
def base_url(url):
    """Return the URL up to and including the last '/' before any query or fragment."""
    return re.match(r'https?://[^?#]+/', url).group(0)
1978
1979
def urljoin(base, path):
    """Join `base` and `path` like urllib.parse.urljoin; None for unusable inputs."""
    if isinstance(path, bytes):
        path = path.decode()
    if not path or not isinstance(path, str):
        return None
    # Absolute or protocol-relative paths are returned unchanged
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
1993
1994
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` (optionally via attribute `get_attr`) to an int scaled by
    invscale/scale; return `default` on any conversion failure."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        # The whole expression stays inside the try so that bad scale types
        # also fall back to the default, matching EAFP semantics
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
2002
2003
def str_or_none(v, default=None):
    """Return str(v), or `default` when v is None."""
    if v is None:
        return default
    return str(v)
2006
2007
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Drop thousands separators, dots and plus signs
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
2015
2016
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to a float scaled by invscale/scale; `default` on failure or None.

    Note: float(None) raises TypeError, so the None case is covered by the
    same exception handler as any other invalid input.
    """
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
2024
2025
def bool_or_none(v, default=None):
    """Return `v` only if it is a real bool; otherwise `default`."""
    if isinstance(v, bool):
        return v
    return default
2028
2029
def strip_or_none(v, default=None):
    """Return v.strip() for strings; `default` for anything else."""
    if isinstance(v, str):
        return v.strip()
    return default
2032
2033
def url_or_none(url):
    """Return the stripped URL if it uses a supported scheme (or is
    protocol-relative); otherwise None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
2039
2040
def request_to_url(req):
    """Extract the full URL from a urllib Request; pass other values through."""
    return req.get_full_url() if isinstance(req, urllib.request.Request) else req
2046
2047
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a unix timestamp (int/float) or YYYYMMDD string with strftime,
    returning *default* when the input cannot be interpreted."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            epoch = datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
            datetime_object = epoch + datetime.timedelta(seconds=timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            datetime_object = None
        # Support %s (unix timestamp) on Windows; the f-string below also
        # raises AttributeError for unhandled input (datetime_object is None)
        date_format = re.sub(
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2065
2066
def parse_duration(s):
    """Parse a human-readable duration string into seconds (float), or None.

    Accepts clock notation ([[days:]hours:]mins:secs[.ms]), ISO-8601-like
    periods (e.g. PT1H30M) and loose English forms ("3 min", "2.5 hours").
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # 1. Clock notation; secs is limited to 2 digits only when preceded by mins
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2. ISO-8601-like periods and spelled-out units; years/months/weeks
        # are matched but not captured, so they do not contribute to the sum
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3. Last resort: bare "N hours" / "N mins" with decimals allowed
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        ms = ms.replace(':', '.')  # clock notation may delimit ms with ':'
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2121
2122
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the filename's real extension.

    When *expected_real_ext* is given and the real extension differs, *ext* is
    appended after the whole filename instead.

    BUGFIX: the mismatch branch previously returned the literal string
    '(unknown).{ext}', discarding the original filename entirely.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else f'{filename}.{ext}')
2129
2130
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the filename's extension for *ext*; when *expected_real_ext* is
    given and does not match, append instead of replacing."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return f'{name}.{ext}'
2136
2137
def check_executable(exe, args=[]):
    """Check whether *exe* can be spawned from PATH; return its name or False.

    args can be a list of arguments for a short-output invocation (like -version).
    """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
2146
2147
def _get_exe_version_output(exe, args):
    """Run *exe* with *args* and return its combined stdout/stderr text;
    None on a non-zero exit status, False when the binary cannot be launched."""
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else stdout
2160
2161
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from tool *output* using *version_re*;
    fall back to *unrecognized* when nothing matches."""
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    mobj = re.search(version_re, output)
    return mobj.group(1) if mobj else unrecognized
2171
2172
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    out = _get_exe_version_output(exe, args)
    if out is None:
        # Ran but exited non-zero
        return unrecognized[-1]
    if not out:
        # False (not launchable) or empty output
        return out
    return detect_exe_version(out, version_re, unrecognized[0])
2183
2184
def frange(start=0, stop=None, step=1):
    """Float version of range(); supports fractional start/stop/step."""
    if stop is None:
        start, stop = 0, start
    if not step:
        direction = 0  # zero step never advances, so never yield
    elif step > 0:
        direction = 1
    else:
        direction = -1
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2193
2194
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache is shared with views produced by __reversed__/__copy__
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        # Pull everything still pending into the cache
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map an index to the equivalent index from the opposite end (~x)
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Consume only as many items as needed to cover the requested index
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2282
2283
class PagedList:
    """Base class for paginated entry lists; subclasses implement _getslice()."""

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        # pagefunc: callable(pagenum) -> iterable of entries for that page
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        # Updated by subclasses once the real page count becomes known
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return (and optionally cache) the list of entries on *pagenum*."""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            # Pages beyond the known page count are empty by definition
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Materialize the entries in [start, end) as a list."""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # NOTE: only non-negative integer indexing is supported
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
2322
2323
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested window within the current page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember where fetching failed so getpage() treats later
                # pages as empty
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2363
2364
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # Entries to drop from the first page; cleared after the first trim
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted (None = unbounded)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
2389
2390
class PlaylistEntries:
    """Accessor over a playlist's entries: resolves --playlist-items style
    specifications against list, PagedList or lazily-evaluated entries."""

    # Sentinel marking an entry absent from an incomplete playlist
    MissingEntry = object()
    # True once the full extent of the entry list is known
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Spread the known entries over their 1-based requested positions
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches one "[start][:-end[:step]]" segment of a playlist-items spec
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int (single item) or slice per comma-separated segment."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (1-based index, entry) pairs for the user-requested items."""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total entry count when it is knowable, else None."""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Select an index->entry accessor appropriate for the entry container
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Generator of (1-based index, entry); accepts an int or a slice
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2525
2526
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences in *s* into the actual characters."""
    decoder = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda mobj: decoder(mobj.group(0))[0],
        s)
2533
2534
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences in *s* into the actual characters."""
    decoder = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda mobj: decoder(mobj.group(0))[0],
        s)
2541
2542
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    return urllib.parse.quote(s, safe=b"%/;:@&=+$,!~*'()?#[]")
2546
2547
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    def _esc(component):  # same escaping as escape_rfc3986()
        return urllib.parse.quote(component, b"%/;:@&=+$,!~*'()?#[]")

    parsed = urllib.parse.urlparse(url)
    return parsed._replace(
        netloc=parsed.netloc.encode('idna').decode('ascii'),
        path=_esc(parsed.path),
        params=_esc(parsed.params),
        query=_esc(parsed.query),
        fragment=_esc(parsed.fragment),
    ).geturl()
2558
2559
def parse_qs(url, **kwargs):
    """Parse the query string of *url* into a dict of value lists."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2562
2563
def read_batch_urls(batch_fd):
    """Read URLs from a batch-file object, skipping BOMs, comments and blanks.

    @param batch_fd  iterable of lines (text or bytes); closed when done
    @returns         list of cleaned-up URLs

    FIX: pass maxsplit to re.split by keyword — the positional form is
    deprecated since Python 3.13.
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2581
2582
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2585
2586
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url str or parse url tuple
    @param query_update update query
    @returns str
    """
    if isinstance(url, str):
        if not (kwargs or query_update):
            return url  # nothing to change; skip the parse/unparse round-trip
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2605
2606
def update_url_query(url, query):
    """Merge the dict *query* into the URL's existing query string."""
    return update_url(url, query_update=query)
2609
2610
2611 def _multipart_encode_impl(data, boundary):
2612 content_type = 'multipart/form-data; boundary=%s' % boundary
2613
2614 out = b''
2615 for k, v in data.items():
2616 out += b'--' + boundary.encode('ascii') + b'\r\n'
2617 if isinstance(k, str):
2618 k = k.encode()
2619 if isinstance(v, str):
2620 v = v.encode()
2621 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2622 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2623 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2624 if boundary.encode('ascii') in content:
2625 raise ValueError('Boundary overlaps with data')
2626 out += content
2627
2628 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2629
2630 return out, content_type
2631
2632
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            # Retry with a new random boundary on collision with the payload
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            if has_specified_boundary:
                raise
            boundary = None
2661
2662
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """True when *x* is an iterable container (by default excluding str,
    bytes and mappings)."""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
2667
2668
def variadic(x, allowed_types=NO_DEFAULT):
    """Return *x* unchanged when it is already list-like; otherwise wrap it
    in a one-element tuple."""
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
2674
2675
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn; return the first result that raises none of
    the common lookup/arithmetic errors and (when given) matches *expected_type*."""
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
2685
2686
def try_get(src, getter, expected_type=None):
    # Apply one or more getter callables to src; return the first result that
    # does not raise (and matches expected_type, when given)
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2689
2690
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Keep only the (key, value) pairs for which cndn(key, value) is truthy."""
    return {key: value for key, value in dct.items() if cndn(key, value)}
2693
2694
def merge_dicts(*dicts):
    """Merge dicts left-to-right: earlier dicts win, except that an
    empty-string value may be replaced by a later string value; None values
    are never merged."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            if value is None:
                continue
            if key not in merged or (isinstance(value, str) and merged[key] == ''):
                merged[key] = value
    return merged
2703
2704
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Coerce bytes to str using *encoding*; pass str through unchanged.
    # NOTE: the default encoding is evaluated once, at import time
    return string if isinstance(string, str) else str(string, encoding, errors)
2707
2708
# MPAA movie rating -> equivalent age limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines rating -> equivalent age limit
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2726
2727
def parse_age_limit(s):
    """Normalize an age-limit value (int, 'NN+', US movie or TV rating)
    to an integer age, or None when unparsable."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    mobj = re.match(r'^TV[_-]?({})$'.format('|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)), s)
    if mobj:
        return TV_PARENTAL_GUIDELINES['TV-' + mobj.group(1)]
    return None
2744
2745
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback(...);) and return the inner payload;
    non-JSONP input is returned unchanged."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
2754
2755
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript value/object literal into JSON text.

    @param vars    dict of variable-name -> value substitutions
    @param strict  raise ValueError on unknown identifiers instead of
                   stringifying them
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Escapes JSON understands pass through; \x becomes \u00; a
        # backslash-newline disappears; anything else is unescaped
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Evaluate a ${...} interpolation inside a template string
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        # Hex/octal integers; a trailing ':' marks an object key
        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # new Map([[k, v], ...]) -> plain JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # Best-effort cleanup of common constructor/IIFE patterns
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2834
2835
def qualities(quality_ids):
    """Return a function mapping a quality id to its rank within
    *quality_ids* (-1 for unknown ids)."""
    def lookup(quality_id):
        try:
            return quality_ids.index(quality_id)
        except ValueError:
            return -1
    return lookup
2844
2845
# Recognized postprocessor hook names
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Output filename templates used when the user supplies none
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output-template types; the string values are presumably the default
# filename infix for that type (e.g. "<name>.info.json") — verify against callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# NOTE: this is a str.format template: {0} is the key pattern, {1} the type pattern
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
    '''


STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2885
2886
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
2895
2896
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2899
2900
def is_outdated_version(version, limit, assume_new=True):
    """True when *version* sorts strictly before *limit*; empty or unparsable
    input yields ``not assume_new``."""
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
2908
2909
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at module top — presumably to avoid an
    # import cycle with the update module; confirm before moving
    from ..update import is_non_updateable

    return not is_non_updateable()
2916
2917
def args_to_str(args):
    """Get a short, shell-quoted string representation of a subprocess command."""
    return ' '.join(map(compat_shlex_quote, args))
2921
2922
def error_to_str(err):
    """Format an exception as 'TypeName: message'."""
    return '{}: {}'.format(type(err).__name__, err)
2925
2926
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (optionally with parameters) to a file extension.

    Looks up the full type, then the bare subtype, then the subtype's last
    '+'-suffix; unknown subtypes fall back to *default* or to the subtype
    with '+' replaced by '.'.
    """
    if not isinstance(mt, str):
        if default is not NO_DEFAULT:
            return default
        return None

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Discard parameters (e.g. "; charset=utf-8") and normalize case
    mimetype = mt.partition(';')[0].strip().lower()
    _, _, subtype = mimetype.rpartition('/')

    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    elif default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
3015
3016
def ext2mimetype(ext_or_url):
    """Guess a MIME type from a bare extension or a URL/filename;
    None when unknown or empty."""
    if not ext_or_url:
        return None
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    return mimetypes.guess_type(name)[0]
3023
3024
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into vcodec/acodec/scodec fields and,
    where detectable from the codec id, the dynamic range (DV/HDR10)."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Collapse zero-padded numbers before splitting on '.'
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                continue
            vcodec = full_codec
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Exactly two unrecognized codecs: assume video + audio, in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3065
3066
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Pick a container extension able to hold the given video/audio streams.

    Tries codec compatibility first, then extension compatibility, finally
    falling back to mkv (or the last preference when mkv is not allowed).
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    # Multiple video or audio streams: only mkv is known to hold them
    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Reduce e.g. 'avc1.640028' to 'avc1' (dropping zeroes and profile info)
    sanitize_codec = functools.partial(
        try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3106
3107
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Determine a file extension from a response's headers."""
    headers = url_handle.headers

    # Prefer the filename advertised in Content-Disposition, if any
    cd = headers.get('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    # Some servers expose the original object name in this header
    meta_name = headers.get('x-amz-meta-name')
    if meta_name:
        ext = meta_name.rpartition('.')[2]
        if ext:
            return ext

    # Fall back to mapping the Content-Type MIME type
    return mimetype2ext(headers.get('Content-Type'), default=default)
3126
3127
def encode_data_uri(data, mime_type):
    """Build a base64 `data:` URI for the given bytes and MIME type."""
    encoded = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{encoded}'
3130
3131
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # Either no limit was requested, or the content is unrestricted
        return False
    return age_limit < content_limit
3140
3141
# List of known byte-order-marks (BOM)
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    # The UTF-32 BOMs must come before the UTF-16 ones, since
    # the UTF-16-LE BOM is a prefix of the UTF-32-LE BOM
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3150
3151
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip (possibly repeated) BOMs and remember the encoding they imply
        while first_bytes.startswith(bom):
            encoding = bom_encoding
            first_bytes = first_bytes[len(bom):]
    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3161
3162
def determine_protocol(info_dict):
    """Infer the download protocol for an info dict (explicit value wins)."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    # Streaming protocols identified directly by URL scheme prefix
    for scheme in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(scheme):
            return scheme

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    # Plain http/https (or whatever the URL scheme is)
    return urllib.parse.urlparse(url).scheme
3183
3184
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: terminal escape sequences and tabs occupy no columns
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        # Widest cell of each column
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only the columns whose filterArray entry is truthy;
        # missing entries default to True, so extra columns are kept
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # With hide_empty, drop columns whose cells are all empty (max width 0)
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a delimiter row (delim repeated to each column's width) after the header
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Replace the \t with padding so the trailing part is right-aligned
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3215
3216
3217 def _match_one(filter_part, dct, incomplete):
3218 # TODO: Generalize code with YoutubeDL._build_format_filter
3219 STRING_OPERATORS = {
3220 '*=': operator.contains,
3221 '^=': lambda attr, value: attr.startswith(value),
3222 '$=': lambda attr, value: attr.endswith(value),
3223 '~=': lambda attr, value: re.search(value, attr),
3224 }
3225 COMPARISON_OPERATORS = {
3226 **STRING_OPERATORS,
3227 '<=': operator.le, # "<=" must be defined above "<"
3228 '<': operator.lt,
3229 '>=': operator.ge,
3230 '>': operator.gt,
3231 '=': operator.eq,
3232 }
3233
3234 if isinstance(incomplete, bool):
3235 is_incomplete = lambda _: incomplete
3236 else:
3237 is_incomplete = lambda k: k in incomplete
3238
3239 operator_rex = re.compile(r'''(?x)
3240 (?P<key>[a-z_]+)
3241 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3242 (?:
3243 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3244 (?P<strval>.+?)
3245 )
3246 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3247 m = operator_rex.fullmatch(filter_part.strip())
3248 if m:
3249 m = m.groupdict()
3250 unnegated_op = COMPARISON_OPERATORS[m['op']]
3251 if m['negation']:
3252 op = lambda attr, value: not unnegated_op(attr, value)
3253 else:
3254 op = unnegated_op
3255 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3256 if m['quote']:
3257 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3258 actual_value = dct.get(m['key'])
3259 numeric_comparison = None
3260 if isinstance(actual_value, (int, float)):
3261 # If the original field is a string and matching comparisonvalue is
3262 # a number we should respect the origin of the original field
3263 # and process comparison value as a string (see
3264 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3265 try:
3266 numeric_comparison = int(comparison_value)
3267 except ValueError:
3268 numeric_comparison = parse_filesize(comparison_value)
3269 if numeric_comparison is None:
3270 numeric_comparison = parse_filesize(f'{comparison_value}B')
3271 if numeric_comparison is None:
3272 numeric_comparison = parse_duration(comparison_value)
3273 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3274 raise ValueError('Operator %s only supports string values!' % m['op'])
3275 if actual_value is None:
3276 return is_incomplete(m['key']) or m['none_inclusive']
3277 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3278
3279 UNARY_OPERATORS = {
3280 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3281 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3282 }
3283 operator_rex = re.compile(r'''(?x)
3284 (?P<op>%s)\s*(?P<key>[a-z_]+)
3285 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3286 m = operator_rex.fullmatch(filter_part.strip())
3287 if m:
3288 op = UNARY_OPERATORS[m.group('op')]
3289 actual_value = dct.get(m.group('key'))
3290 if is_incomplete(m.group('key')) and actual_value is None:
3291 return True
3292 return op(actual_value)
3293
3294 raise ValueError('Invalid filter part %r' % filter_part)
3295
3296
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
    Can be True/False to indicate all/none of the keys may be missing.
    All conditions on incomplete keys pass if the key is missing
    """
    # '&' joins conditions; a literal '&' may be escaped as '\&'
    conditions = re.split(r'(?<!\\)&', filter_str)
    return all(
        _match_one(condition.replace(r'\&', '&'), dct, incomplete)
        for condition in conditions)
3307
3308
def match_filter_func(filters, breaking_filters=None):
    """Build a match-filter callable from filter strings; None if no filters given."""
    if not filters and not breaking_filters:
        return None
    # Breaking filters abort the whole run (RejectedVideoReached) when they match
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    # A lone '-' requests interactive confirmation instead of rejection
    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        breaking_reason = breaking_filters(info_dict, incomplete)
        if breaking_reason is not None:
            raise RejectedVideoReached(breaking_reason)

        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            # NO_DEFAULT signals "ask the user" in interactive mode
            return NO_DEFAULT if interactive and not incomplete else None
        video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
        filter_str = ') | ('.join(map(str.strip, filters))
        return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3331
3332
class download_range_func:
    """Callable that yields the sections (matched chapters and/or time ranges)
    of a video to download."""

    def __init__(self, chapters, ranges, from_info=False):
        # chapters: regexes matched against chapter titles
        # ranges: (start_time, end_time) pairs in seconds
        # from_info: also honor start_time/end_time from the info dict
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):
        # Generator: yields one dict per section to download

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
        # Warn only when chapter regexes were given but nothing matched
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # No constraints at all: download everything
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # Negative timestamps count back from the end of the video (when duration is known)
        return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time

    def __eq__(self, other):
        # NOTE(review): from_info is not part of equality — confirm this is intentional
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges)

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3373
3374
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds; None if unparsable."""
    if not time_expr:
        return None

    # Plain offset in seconds, optionally suffixed with 's' (e.g. '12.3s')
    mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # Clock time 'H:MM:SS', fraction separated by '.' or ':' (frames variant)
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes, seconds = mobj.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3386
3387
def srt_subtitles_timecode(seconds):
    """Format seconds as an SRT timecode: HH:MM:SS,mmm."""
    hours, minutes, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, msec)
3390
3391
def ass_subtitles_timecode(seconds):
    """Format seconds as an ASS timecode: H:MM:SS.cc (centiseconds)."""
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], timetuple.milliseconds / 10)
3395
3396
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no subtitle paragraphs
    '''
    # Legacy TTAF namespaces are rewritten to their modern TTML equivalents
    # (as raw bytes, before XML parsing)
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # Styling attributes that are converted to SRT/HTML-like markup
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    # Helper to qualify tag/attribute names with the TTML namespaces
    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}          # style id -> resolved style properties
    default_style = {}   # style inherited from the body/div elements

    class TTMLPElementParser:
        # SAX-style target that renders one <p> subtree to SRT-styled text.
        # NOTE(review): these are class-level (shared) attributes; appends in
        # start() are balanced by pops in end(), but state could leak between
        # instances if parsing aborts mid-element — confirm this is acceptable
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                # Inline tts:* attributes override referenced styles
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the markup opened for this element, innermost first
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize the node and feed it through TTMLPElementParser to get styled text
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; loop again while a parent style is not yet known
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style referenced from body/div becomes the default for all paragraphs
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # No explicit end: derive it from the duration, else skip the cue
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3563
3564
def cli_option(params, command_option, param, separator=None):
    """Render one CLI option from a params dict entry; [] when the value is unset."""
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3570
3571
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean CLI option; [] when the param is unset (None)."""
    value = params.get(param)
    assert value in (True, False, None)
    if value is None:
        return []
    rendered = true_value if value else false_value
    if separator is None:
        return [command_option, str(rendered)]
    return [f'{command_option}{separator}{rendered}']
3576
3577
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3580
3581
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Return the first matching argument list from argdict for the given keys."""
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_list in keys:
        # A key entry may itself be several alternative keys
        matches = [argdict.get(key.lower()) for key in variadic(key_list)]
        arg_list = [args for args in matches if args is not None]
        if arg_list:
            # Flatten the per-key argument lists into one
            return [arg for args in arg_list for arg in args]
    return default
3600
3601
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve the configuration-arg lookup keys for main_key/exe and delegate."""
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    prefixed_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key in prefixed_keys:
        # The bare root key is requested -> also fall back to generic entries
        if main_key != exe:
            prefixed_keys.append((main_key, exe))
        prefixed_keys.append('default')
    else:
        use_compat = False
    return cli_configuration_args(argdict, prefixed_keys, default, use_compat)
3613
3614
class ISO639Utils:
    """Conversion between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter)
    language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE: some values are duplicated (e.g. 'he'/'iw' -> 'heb'); long2short
    # relies on insertion order to return the current (non-deprecated) code
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant (e.g. 'en-US' -> 'eng')
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear reverse lookup; returns None when no short form exists
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
3819
3820
class ISO3166Utils:
    """Lookup of full country names from two-letter country codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Lookup is case-insensitive; returns None for unknown codes.
        # NOTE(review): the keys are two-letter country codes (plus AP/EU for
        # IP blocks); the docstring's "3166-2" label looks like it means the
        # two-letter (alpha-2) form — confirm
        return cls._country_map.get(code.upper())
4082
4083
4084 class GeoUtils:
4085 # Major IPv4 address blocks per country
4086 _country_ip_map = {
4087 'AD': '46.172.224.0/19',
4088 'AE': '94.200.0.0/13',
4089 'AF': '149.54.0.0/17',
4090 'AG': '209.59.64.0/18',
4091 'AI': '204.14.248.0/21',
4092 'AL': '46.99.0.0/16',
4093 'AM': '46.70.0.0/15',
4094 'AO': '105.168.0.0/13',
4095 'AP': '182.50.184.0/21',
4096 'AQ': '23.154.160.0/24',
4097 'AR': '181.0.0.0/12',
4098 'AS': '202.70.112.0/20',
4099 'AT': '77.116.0.0/14',
4100 'AU': '1.128.0.0/11',
4101 'AW': '181.41.0.0/18',
4102 'AX': '185.217.4.0/22',
4103 'AZ': '5.197.0.0/16',
4104 'BA': '31.176.128.0/17',
4105 'BB': '65.48.128.0/17',
4106 'BD': '114.130.0.0/16',
4107 'BE': '57.0.0.0/8',
4108 'BF': '102.178.0.0/15',
4109 'BG': '95.42.0.0/15',
4110 'BH': '37.131.0.0/17',
4111 'BI': '154.117.192.0/18',
4112 'BJ': '137.255.0.0/16',
4113 'BL': '185.212.72.0/23',
4114 'BM': '196.12.64.0/18',
4115 'BN': '156.31.0.0/16',
4116 'BO': '161.56.0.0/16',
4117 'BQ': '161.0.80.0/20',
4118 'BR': '191.128.0.0/12',
4119 'BS': '24.51.64.0/18',
4120 'BT': '119.2.96.0/19',
4121 'BW': '168.167.0.0/16',
4122 'BY': '178.120.0.0/13',
4123 'BZ': '179.42.192.0/18',
4124 'CA': '99.224.0.0/11',
4125 'CD': '41.243.0.0/16',
4126 'CF': '197.242.176.0/21',
4127 'CG': '160.113.0.0/16',
4128 'CH': '85.0.0.0/13',
4129 'CI': '102.136.0.0/14',
4130 'CK': '202.65.32.0/19',
4131 'CL': '152.172.0.0/14',
4132 'CM': '102.244.0.0/14',
4133 'CN': '36.128.0.0/10',
4134 'CO': '181.240.0.0/12',
4135 'CR': '201.192.0.0/12',
4136 'CU': '152.206.0.0/15',
4137 'CV': '165.90.96.0/19',
4138 'CW': '190.88.128.0/17',
4139 'CY': '31.153.0.0/16',
4140 'CZ': '88.100.0.0/14',
4141 'DE': '53.0.0.0/8',
4142 'DJ': '197.241.0.0/17',
4143 'DK': '87.48.0.0/12',
4144 'DM': '192.243.48.0/20',
4145 'DO': '152.166.0.0/15',
4146 'DZ': '41.96.0.0/12',
4147 'EC': '186.68.0.0/15',
4148 'EE': '90.190.0.0/15',
4149 'EG': '156.160.0.0/11',
4150 'ER': '196.200.96.0/20',
4151 'ES': '88.0.0.0/11',
4152 'ET': '196.188.0.0/14',
4153 'EU': '2.16.0.0/13',
4154 'FI': '91.152.0.0/13',
4155 'FJ': '144.120.0.0/16',
4156 'FK': '80.73.208.0/21',
4157 'FM': '119.252.112.0/20',
4158 'FO': '88.85.32.0/19',
4159 'FR': '90.0.0.0/9',
4160 'GA': '41.158.0.0/15',
4161 'GB': '25.0.0.0/8',
4162 'GD': '74.122.88.0/21',
4163 'GE': '31.146.0.0/16',
4164 'GF': '161.22.64.0/18',
4165 'GG': '62.68.160.0/19',
4166 'GH': '154.160.0.0/12',
4167 'GI': '95.164.0.0/16',
4168 'GL': '88.83.0.0/19',
4169 'GM': '160.182.0.0/15',
4170 'GN': '197.149.192.0/18',
4171 'GP': '104.250.0.0/19',
4172 'GQ': '105.235.224.0/20',
4173 'GR': '94.64.0.0/13',
4174 'GT': '168.234.0.0/16',
4175 'GU': '168.123.0.0/16',
4176 'GW': '197.214.80.0/20',
4177 'GY': '181.41.64.0/18',
4178 'HK': '113.252.0.0/14',
4179 'HN': '181.210.0.0/16',
4180 'HR': '93.136.0.0/13',
4181 'HT': '148.102.128.0/17',
4182 'HU': '84.0.0.0/14',
4183 'ID': '39.192.0.0/10',
4184 'IE': '87.32.0.0/12',
4185 'IL': '79.176.0.0/13',
4186 'IM': '5.62.80.0/20',
4187 'IN': '117.192.0.0/10',
4188 'IO': '203.83.48.0/21',
4189 'IQ': '37.236.0.0/14',
4190 'IR': '2.176.0.0/12',
4191 'IS': '82.221.0.0/16',
4192 'IT': '79.0.0.0/10',
4193 'JE': '87.244.64.0/18',
4194 'JM': '72.27.0.0/17',
4195 'JO': '176.29.0.0/16',
4196 'JP': '133.0.0.0/8',
4197 'KE': '105.48.0.0/12',
4198 'KG': '158.181.128.0/17',
4199 'KH': '36.37.128.0/17',
4200 'KI': '103.25.140.0/22',
4201 'KM': '197.255.224.0/20',
4202 'KN': '198.167.192.0/19',
4203 'KP': '175.45.176.0/22',
4204 'KR': '175.192.0.0/10',
4205 'KW': '37.36.0.0/14',
4206 'KY': '64.96.0.0/15',
4207 'KZ': '2.72.0.0/13',
4208 'LA': '115.84.64.0/18',
4209 'LB': '178.135.0.0/16',
4210 'LC': '24.92.144.0/20',
4211 'LI': '82.117.0.0/19',
4212 'LK': '112.134.0.0/15',
4213 'LR': '102.183.0.0/16',
4214 'LS': '129.232.0.0/17',
4215 'LT': '78.56.0.0/13',
4216 'LU': '188.42.0.0/16',
4217 'LV': '46.109.0.0/16',
4218 'LY': '41.252.0.0/14',
4219 'MA': '105.128.0.0/11',
4220 'MC': '88.209.64.0/18',
4221 'MD': '37.246.0.0/16',
4222 'ME': '178.175.0.0/17',
4223 'MF': '74.112.232.0/21',
4224 'MG': '154.126.0.0/17',
4225 'MH': '117.103.88.0/21',
4226 'MK': '77.28.0.0/15',
4227 'ML': '154.118.128.0/18',
4228 'MM': '37.111.0.0/17',
4229 'MN': '49.0.128.0/17',
4230 'MO': '60.246.0.0/16',
4231 'MP': '202.88.64.0/20',
4232 'MQ': '109.203.224.0/19',
4233 'MR': '41.188.64.0/18',
4234 'MS': '208.90.112.0/22',
4235 'MT': '46.11.0.0/16',
4236 'MU': '105.16.0.0/12',
4237 'MV': '27.114.128.0/18',
4238 'MW': '102.70.0.0/15',
4239 'MX': '187.192.0.0/11',
4240 'MY': '175.136.0.0/13',
4241 'MZ': '197.218.0.0/15',
4242 'NA': '41.182.0.0/16',
4243 'NC': '101.101.0.0/18',
4244 'NE': '197.214.0.0/18',
4245 'NF': '203.17.240.0/22',
4246 'NG': '105.112.0.0/12',
4247 'NI': '186.76.0.0/15',
4248 'NL': '145.96.0.0/11',
4249 'NO': '84.208.0.0/13',
4250 'NP': '36.252.0.0/15',
4251 'NR': '203.98.224.0/19',
4252 'NU': '49.156.48.0/22',
4253 'NZ': '49.224.0.0/14',
4254 'OM': '5.36.0.0/15',
4255 'PA': '186.72.0.0/15',
4256 'PE': '186.160.0.0/14',
4257 'PF': '123.50.64.0/18',
4258 'PG': '124.240.192.0/19',
4259 'PH': '49.144.0.0/13',
4260 'PK': '39.32.0.0/11',
4261 'PL': '83.0.0.0/11',
4262 'PM': '70.36.0.0/20',
4263 'PR': '66.50.0.0/16',
4264 'PS': '188.161.0.0/16',
4265 'PT': '85.240.0.0/13',
4266 'PW': '202.124.224.0/20',
4267 'PY': '181.120.0.0/14',
4268 'QA': '37.210.0.0/15',
4269 'RE': '102.35.0.0/16',
4270 'RO': '79.112.0.0/13',
4271 'RS': '93.86.0.0/15',
4272 'RU': '5.136.0.0/13',
4273 'RW': '41.186.0.0/16',
4274 'SA': '188.48.0.0/13',
4275 'SB': '202.1.160.0/19',
4276 'SC': '154.192.0.0/11',
4277 'SD': '102.120.0.0/13',
4278 'SE': '78.64.0.0/12',
4279 'SG': '8.128.0.0/10',
4280 'SI': '188.196.0.0/14',
4281 'SK': '78.98.0.0/15',
4282 'SL': '102.143.0.0/17',
4283 'SM': '89.186.32.0/19',
4284 'SN': '41.82.0.0/15',
4285 'SO': '154.115.192.0/18',
4286 'SR': '186.179.128.0/17',
4287 'SS': '105.235.208.0/21',
4288 'ST': '197.159.160.0/19',
4289 'SV': '168.243.0.0/16',
4290 'SX': '190.102.0.0/20',
4291 'SY': '5.0.0.0/16',
4292 'SZ': '41.84.224.0/19',
4293 'TC': '65.255.48.0/20',
4294 'TD': '154.68.128.0/19',
4295 'TG': '196.168.0.0/14',
4296 'TH': '171.96.0.0/13',
4297 'TJ': '85.9.128.0/18',
4298 'TK': '27.96.24.0/21',
4299 'TL': '180.189.160.0/20',
4300 'TM': '95.85.96.0/19',
4301 'TN': '197.0.0.0/11',
4302 'TO': '175.176.144.0/21',
4303 'TR': '78.160.0.0/11',
4304 'TT': '186.44.0.0/15',
4305 'TV': '202.2.96.0/19',
4306 'TW': '120.96.0.0/11',
4307 'TZ': '156.156.0.0/14',
4308 'UA': '37.52.0.0/14',
4309 'UG': '102.80.0.0/13',
4310 'US': '6.0.0.0/8',
4311 'UY': '167.56.0.0/13',
4312 'UZ': '84.54.64.0/18',
4313 'VA': '212.77.0.0/19',
4314 'VC': '207.191.240.0/21',
4315 'VE': '186.88.0.0/13',
4316 'VG': '66.81.192.0/20',
4317 'VI': '146.226.0.0/16',
4318 'VN': '14.160.0.0/11',
4319 'VU': '202.80.32.0/20',
4320 'WF': '117.20.32.0/21',
4321 'WS': '202.4.32.0/19',
4322 'YE': '134.35.0.0/16',
4323 'YT': '41.242.116.0/22',
4324 'ZA': '41.0.0.0/11',
4325 'ZM': '102.144.0.0/13',
4326 'ZW': '102.177.192.0/18',
4327 }
4328
4329 @classmethod
4330 def random_ipv4(cls, code_or_block):
4331 if len(code_or_block) == 2:
4332 block = cls._country_ip_map.get(code_or_block.upper())
4333 if not block:
4334 return None
4335 else:
4336 block = code_or_block
4337 addr, preflen = block.split('/')
4338 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4339 addr_max = addr_min | (0xffffffff >> int(preflen))
4340 return str(socket.inet_ntoa(
4341 struct.pack('!L', random.randint(addr_min, addr_max))))
4342
4343
4344 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4345 # released into Public Domain
4346 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4347
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    # int.to_bytes replaces the original PyCrypto-era manual 32-bit chunking
    # plus leading-zero stripping. Non-positive n encodes as a single NUL
    # byte, matching the original behavior for n == 0.
    if n > 0:
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    # Pad the front so that len(s) is a multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4376
4377
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes performs the big-endian accumulation that the original
    # PyCrypto-derived code did manually with 4-byte struct chunks.
    return int.from_bytes(s, 'big')
4393
4394
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # OHDave treats the input as little-endian, hence the byte reversal
    # before interpreting the hex digits as one big integer
    plaintext_int = int(binascii.hexlify(data[::-1]), 16)
    ciphertext_int = pow(plaintext_int, exponent, modulus)
    return f'{ciphertext_int:x}'
4410
4411
4412 def pkcs1pad(data, length):
4413 """
4414 Padding input data with PKCS#1 scheme
4415
4416 @param {int[]} data input data
4417 @param {int} length target length
4418 @returns {int[]} padded data
4419 """
4420 if len(data) > length - 11:
4421 raise ValueError('Input data too long for PKCS#1 padding')
4422
4423 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4424 return [0, 2] + pseudo_random + [0] + data
4425
4426
4427 def _base_n_table(n, table):
4428 if not table and not n:
4429 raise ValueError('Either table or n must be specified')
4430 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4431
4432 if n and n != len(table):
4433 raise ValueError(f'base {n} exceeds table length {len(table)}')
4434 return table
4435
4436
4437 def encode_base_n(num, n=None, table=None):
4438 """Convert given int to a base-n string"""
4439 table = _base_n_table(n, table)
4440 if not num:
4441 return table[0]
4442
4443 result, base = '', len(table)
4444 while num:
4445 result = table[num % base] + result
4446 num = num // base
4447 return result
4448
4449
4450 def decode_base_n(string, n=None, table=None):
4451 """Convert given base-n string to int"""
4452 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4453 result, base = 0, len(table)
4454 for char in string:
4455 result = result * base + table[char]
4456 return result
4457
4458
def decode_packed_codes(code):
    """Decode "packed" obfuscated JavaScript located by PACKED_CODES_RE
    (the common eval(function(p,a,c,k,e,d){...}) pattern)."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each base-n encoded index back to its symbol; an empty symbol
    # entry means the encoded index stands for itself
    symbol_table = {}
    for index in reversed(range(count)):
        encoded = encode_base_n(index, base)
        symbol_table[encoded] = symbols[index] or encoded

    return re.sub(
        r'\b(\w+)\b', lambda match: symbol_table[match.group(0)],
        obfuscated_code)
4475
4476
def caesar(s, alphabet, shift):
    """Shift every character of `s` by `shift` positions within `alphabet`;
    characters not in the alphabet pass through unchanged."""
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = []
    for char in s:
        if char in alphabet:
            shifted.append(alphabet[(alphabet.index(char) + shift) % size])
        else:
            shifted.append(char)
    return ''.join(shifted)


def rot47(s):
    """ROT47: a Caesar shift of 47 over the 94 printable ASCII characters."""
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4488
4489
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (KEY=value,KEY2="quoted value") into a dict."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Quoted values may contain commas; strip the surrounding quotes
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
4497
4498
def urshift(val, n):
    """Unsigned 32-bit right shift (the JavaScript `>>>` operator)."""
    if val >= 0:
        return val >> n
    # Reinterpret a negative value as its unsigned 32-bit equivalent first
    return (val + 0x100000000) >> n
4501
4502
def write_xattr(path, key, value):
    """Set the extended attribute `key` to `value` (bytes) on the file at `path`.

    Tries, in order: NTFS alternate data streams (Windows), the xattr/pyxattr
    Python modules, then the setfattr/xattr command-line tools.
    Raises XAttrMetadataError when writing fails and XAttrUnavailableError
    when no write mechanism is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The external tools take the value as a command-line string argument
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4552
4553
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the components (as
    strings) of a random date between 1950-01-01 and 1995-12-31 inclusive."""
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }
4564
4565
def find_available_port(interface=''):
    """Return a currently-free TCP port on `interface` (all interfaces by
    default), or None if binding failed."""
    try:
        with socket.socket() as s:
            # Port 0 asks the OS to pick any unused port
            s.bind((interface, 0))
            _, port = s.getsockname()[:2]
            return port
    except OSError:
        return None
4573
4574
# Templates for internet shortcut files, which are plain text files.
# Each template substitutes %(url)s (and, for .desktop, %(filename)s).

# Windows ".url" shortcut (INI format)
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS ".webloc" shortcut (Apple XML property list)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org ".desktop" link (Linux desktop environments)
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Shortcut-format name -> template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4606
4607
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    # Rebuild the authority component: userinfo, punycoded host, port
    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    # NOTE(review): an explicit port 80 is dropped regardless of scheme, so
    # e.g. "https://host:80/" would lose its (non-default) port — confirm intended.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4650
4651
def to_high_limit_path(path):
    """On Windows, return the absolute path prefixed with the extended-length
    path marker to bypass the MAX_PATH limit; elsewhere return `path` as-is."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # Work around MAX_PATH limitation on Windows. The maximum allowed length
    # for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
4658
4659
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Extract `field` from `obj` (via traverse_obj) and interpolate it into
    `template` after applying `func`.

    Returns `default` when the value is falsy, or — if `ignore` is given —
    when the value is one of the ignored values.
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        use_default = not val
    else:
        use_default = val in variadic(ignore)
    return default if use_default else template % func(val)
4665
4666
def clean_podcast_url(url):
    """Strip known podcast tracking/measurement redirect prefixes from `url`."""
    url = re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/''', '', url)
    # Collapse a doubled scheme left behind by the prefix removal,
    # e.g. "https://https://example.com/..." -> "https://example.com/..."
    return re.sub(r'^\w+://(\w+://)', r'\1', url)
4687
4688
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random RFC 4122 version-4 UUID string.

    The 'y' nibble carries the RFC 4122 variant and must be 8, 9, a or b;
    the previous implementation drew it from all 16 hex digits, producing
    invalid UUIDs three times out of four.
    """
    def _fill(match):
        if match.group(0) == 'x':
            return _HEX_TABLE[random.randint(0, 15)]
        # Variant nibble: binary 10xx -> one of 8, 9, a, b
        return _HEX_TABLE[8 + random.randint(0, 3)]

    return re.sub(r'[xy]', _fill, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4694
4695
def make_dir(path, to_screen=None):
    """Ensure the parent directory of `path` exists.

    Returns True on success (including when `path` has no directory
    component), False on failure. When creation fails and `to_screen` is
    callable, it is invoked with an error message.
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # BUG FIX: was `if callable(to_screen) is not None:` — callable()
        # returns a bool, so the test was always True and calling a
        # non-callable to_screen (e.g. None) raised TypeError
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4706
4707
def get_executable_path():
    """Return the directory containing the running executable/script
    (the path component of _get_variant_and_executable_path's result)."""
    # Imported lazily to avoid a circular import with the update module
    from ..update import _get_variant_and_executable_path

    return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
4712
4713
def get_user_config_dirs(package_name):
    """Yield the candidate per-user configuration directories for `package_name`."""
    # XDG base directory (e.g. ~/.config/package_name)
    xdg_base = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
    yield os.path.join(xdg_base, package_name)

    # Windows roaming profile (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # Dotted directory directly in the home folder (~/.package_name)
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')
4726
4727
def get_system_config_dirs(package_name):
    """Yield the system-wide configuration directories for `package_name`."""
    # Currently only /etc/package_name
    yield os.path.join('/etc', package_name)
4731
4732
def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
    """
    # Any datetime.timedelta keyword (hours=..., days=..., etc.) offsets the result
    offset = datetime.timedelta(**kwargs).total_seconds()
    return time.time() + offset
4738
4739
4740 # create a JSON Web Signature (jws) with HS256 algorithm
4741 # the resulting format is in JWS Compact Serialization
4742 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
4743 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JWS token (JWS Compact Serialization) signed with HS256.

    @param payload_data  JSON-serializable claims dict
    @param key           shared secret (str)
    @param headers       extra/overriding JOSE header fields (optional)
    @returns bytes       b"<header>.<payload>.<signature>"
    """
    # FIX: `headers` previously defaulted to a mutable dict ({})
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    # NOTE(review): RFC 7515 specifies URL-safe base64 without padding; this
    # uses standard base64 with padding — kept as-is since existing consumers
    # of these tokens evidently accept it.
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4757
4758
4759 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode the payload of a JWS compact token (the signature is NOT verified)."""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # Restore any stripped base64 padding; superfluous '=' are ignored
    return json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
4765
4766
# None means "not applicable" (non-Windows); windows_enable_vt_mode() flips
# this to True once the console accepts VT sequences
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Whether ANSI escape sequences can sensibly be written to `stream`."""
    if compat_os_name == 'nt':
        # The console must have been switched into VT mode first
        terminal_ok = bool(WINDOWS_VT_MODE)
    else:
        terminal_ok = bool(os.getenv('TERM'))
    if not terminal_ok:
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
4781
4782
def windows_enable_vt_mode():
    """Ref: https://bugs.python.org/issue30075 """
    if get_windows_version() < (10, 0, 10586):
        # Virtual-terminal processing requires Windows 10 build 10586+
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly (independent of stdout redirection)
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Keep the existing mode bits; only add VT processing
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Invalidate cached answers now that VT mode is available
    supports_terminal_sequences.cache_clear()
4813
4814
# Matches ANSI SGR escape sequences, e.g. "\033[31m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip ANSI SGR (color/style) escape sequences from `string`."""
    return _terminal_sequences_re.sub('', string)
4820
4821
def number_of_digits(number):
    """Length of `number`'s decimal representation (a leading '-' counts)."""
    as_decimal = '%d' % number
    return len(as_decimal)
4824
4825
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy `values` (stringified) with `delim`; when `from_dict`
    is given, each value is first used as a traversal path into that dict."""
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    truthy = (str(value) for value in values if value)
    return delim.join(truthy)
4830
4831
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        # No format reports a width; nothing to scale against
        return thumbnails
    max_width = str(max_dimensions[0])
    scaled = []
    for thumbnail in thumbnails:
        scaled.append(merge_dicts(
            {'url': re.sub(url_width_re, max_width, thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
4852
4853
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not crg:
        return None, None, None
    start, end, total = crg.groups()
    # `start` always matches; `end` and `total` may be absent
    return (
        int(start),
        int(end) if end is not None else None,
        int(total) if total is not None else None,
    )
4862
4863
def read_stdin(what):
    """Announce that `what` will be read from STDIN, then return sys.stdin."""
    # The EOF key combination differs per platform
    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
4868
4869
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """
    # A byte-order mark takes precedence over any in-band declaration
    for bom, encoding in BOMS:
        if data.startswith(bom):
            return encoding, len(bom)

    # Drop NUL bytes so a "# coding: ..." declaration matches even for
    # UTF-16/UTF-32 text, whatever the endianness
    stripped = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if mobj:
        return mobj.group(1).decode(), 0
    return None, 0
4886
4887
class Config:
    """One source of program options (command line, a config file, or stdin),
    plus any configs pulled in recursively via --config-locations.
    """
    own_args = None     # raw args this config was initialized with
    parsed_args = None  # same as own_args once load_configs() has run
    filename = None     # config file path, when loaded from a file
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        # _loaded_paths is shared with child configs to detect include loops
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own args and recursively load any --config-locations.

        Returns False (and loads nothing) when this config's file was
        already loaded, which breaks include cycles.
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # '-' means: read extra options from stdin (at most once)
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against this config's directory
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read and shlex-split a config file; returns `default` when absent."""
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # BUG FIX: the message previously did not name the offending file
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with the values of credential options redacted."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Redact "--password=secret" style options
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Redact "--password secret" style options (value in the next arg)
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        # Child configs' args first (in reverse append order), then our own
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
4995
4996
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes"""
    pool = None  # the connected protocol object, set by __enter__

    def __init__(self, url, headers=None, connect=True):
        # A private event loop lets synchronous code drive the async websockets API
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Make sure the connection is torn down at interpreter exit
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        # Blocking send on the private loop
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        # Blocking receive on the private loop
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            self.loop.close()
            # NOTE(review): tasks are cancelled AFTER the loop is closed, and
            # _cancel_all_tasks calls run_until_complete on that closed loop —
            # confirm this ordering is intentional
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        # Synchronously run the coroutine `main` to completion on `loop`
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        # Surface any exceptions from cancelled work via the loop's handler
        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5066
5067
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for key, value in headers.items():
            # Title-casing unifies e.g. 'content-type' and 'CONTENT-TYPE'
            merged[key.title()] = value
    return merged
5071
5072
def cached_method(f):
    """Cache a method"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Bind and apply defaults so positional/keyword spellings of the
        # same call share a cache key
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        cache_key = tuple(bound.arguments.values())[1:]  # drop `self`

        # Per-instance cache, keyed first by method name
        all_caches = vars(self).setdefault('_cached_method__cache', {})
        cache = all_caches.setdefault(f.__name__, {})
        if cache_key not in cache:
            cache[cache_key] = f(self, *args, **kwargs)
        return cache[cache_key]
    return wrapper
5088
5089
class classproperty:
    """property access for class methods with optional caching"""

    def __new__(cls, func=None, *args, **kwargs):
        # Support both bare `@classproperty` and `@classproperty(cache=True)`
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        if self._cache is None:
            # Uncached: recompute on every access
            return self.func(cls)
        try:
            return self._cache[cls]
        except KeyError:
            value = self._cache[cls] = self.func(cls)
            return value
5108
5109
class function_with_repr:
    """Wrap a callable so repr() yields a readable identifier."""

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self._repr_override = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        if self._repr_override:
            return self._repr_override
        return f'{self.func.__module__}.{self.func.__qualname__}'
5122
5123
class Namespace(types.SimpleNamespace):
    """Namespace whose iteration yields the stored attribute values."""

    def __iter__(self):
        yield from vars(self).values()

    @property
    def items_(self):
        # Trailing underscore avoids clashing with a stored attribute "items"
        return vars(self).items()
5133
5134
# Known media-file extensions, grouped by kind. The "common_*" groups are
# folded into the main "video"/"audio" tuples immediately below.
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions combined
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5149
5150
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # NO_DEFAULT is used as a sentinel meaning "no error recorded for the
    # current attempt"; `attempt`/`_error` here are class-level defaults
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        # Extra kwargs are pre-bound onto the error callback
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # Hide the internal sentinel from callers
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Reset the per-attempt error before yielding control
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            # If the body recorded an error, report it via the callback
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            # Out of retries: report via `error` if provided, else re-raise
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        # sleep_func may be a constant delay or a callable taking the attempt number
        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5205
5206
def make_archive_id(ie, video_id):
    """Build a download-archive entry: '<lowercased extractor key> <video id>'.

    `ie` may be either an extractor key string or an extractor (class/instance)
    exposing `ie_key()`.
    """
    if isinstance(ie, str):
        key = ie
    else:
        key = ie.ie_key()
    return f'{key.lower()} {video_id}'
5210
5211
def truncate_string(s, left, right=0):
    """Shorten `s` to at most `left + right` characters, marking the cut with '...'.

    Keeps `left - 3` leading characters, inserts '...', and - when `right` is
    non-zero - also keeps the trailing `right` characters.  `s` is returned
    unchanged when it is None or already short enough.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5217
5218
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Expand `options` (aliases and '-' negations) into a deduplicated list.

    `alias_dict` maps alias names to lists of entries and must contain an
    'all' entry listing every valid option.  A leading '-' removes previously
    selected entries; when `use_regex` is true, non-alias entries are matched
    case-insensitively as full-match regexes against alias_dict['all'].
    Raises ValueError for an entry that is neither an alias nor a valid option.
    """
    assert 'all' in alias_dict, '"all" alias is required'
    selected = list(start or [])
    for entry in options:
        negated = entry.startswith('-')
        name = entry[1:] if negated else entry

        if name in alias_dict:
            expansion = alias_dict[name]
            if negated:
                # Discarding an alias means discarding each member,
                # so flip the sign of every expanded entry.
                expansion = [i[1:] if i.startswith('-') else f'-{i}' for i in expansion]
            # NB: Do not allow regex in aliases for performance
            selected = orderedSet_from_options(expansion, alias_dict, start=selected)
            continue

        if use_regex:
            matched = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            matched = [name]
        else:
            matched = None
        if matched is None:
            raise ValueError(name)

        if negated:
            # Remove every occurrence of each matched entry.
            for item in matched:
                while item in selected:
                    selected.remove(item)
        else:
            selected.extend(matched)

    return orderedSet(selected)
5247
5248
5249 # TODO: Rewrite
class FormatSorter:
    """Sorts formats according to user/extractor preferences (`--format-sort`).

    Each sort field is described by an entry in `settings`; `calculate_preference`
    turns a format dict into a tuple of per-field preference tuples, suitable as
    a sort key (larger compares as "better").
    """
    # Parses one sort token: optional '+' (reverse), field name, and an
    # optional ':limit' (prefer up to limit) or '~limit' (prefer closest to limit).
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Approximation of youtube-dl's historical sort order.
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration.  Relevant keys:
    #   type: 'ordered' (ranked list), 'boolean', 'extractor', 'combined',
    #         'multiple', 'alias', or the default 'field'
    #   field: the format-dict key(s) backing this sort field
    #   order/order_free: ranking (earlier is better); 'order_free' is used
    #         when --prefer-free-formats is given; 'regex': entries are regexes
    #   convert: value conversion ('float', 'bytes', 'string', ...)
    #   forced/priority: always included / included unless --format-sort-force
    #   visible: whether to show the field in verbose output
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        # `field_preference` is the extractor-supplied sort order.
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Look up `key` for `field` in `settings`, computing/caching a default."""
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            # Unknown fields are accepted for compatibility, but deprecated.
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default  # cache the computed default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert a raw format value according to the field's 'convert' setting."""
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            # Rank the value by its position in the (possibly free-format) order
            # list; earlier entries map to larger (better) numbers.
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            # 'float'/'float_string': numbers sort numerically; once a
            # non-numeric value is seen the field degrades to string sorting.
            if value.isnumeric():
                return float(value)
            else:
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Parse forced, user, extractor and default sort fields into `_order`."""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Register one resolved sort field; first occurrence wins.
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # Order of precedence: forced fields, then (unless --format-sort-force)
        # priority fields, then user fields, extractor fields and defaults.
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Log the effective sort order (user, extractor and resolved fields)."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn one field's value into a comparable preference tuple."""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        # The leading element classifies (missing < worse-than-limit < ok < string),
        # so tuples remain comparable across branches.
        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Extract this field's raw value from `format` and rank it."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return the sort key tuple for `format`, filling in missing fields.

        NB: mutates `format` in place - protocol, extensions, bitrates and a
        preference penalty for out-of-spec HEVC-in-FLV may be filled in here.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5544
5545
5546 # XXX: Temporary
class _YDLLogger:
    """Adapter exposing a plain logger interface over a YoutubeDL instance.

    Every method is a silent no-op when constructed without a YoutubeDL object.
    """

    def __init__(self, ydl=None):
        self._ydl = ydl

    def debug(self, message):
        if not self._ydl:
            return
        self._ydl.write_debug(message)

    def info(self, message):
        if not self._ydl:
            return
        self._ydl.to_screen(message)

    def warning(self, message, *, once=False):
        if not self._ydl:
            return
        self._ydl.report_warning(message, only_once=once)

    def error(self, message, *, is_error=True):
        if not self._ydl:
            return
        self._ydl.report_error(message, is_error=is_error)

    def stdout(self, message):
        if not self._ydl:
            return
        self._ydl.to_stdout(message)

    def stderr(self, message):
        if not self._ydl:
            return
        self._ydl.to_stderr(message)