]> jfr.im git - yt-dlp.git/blob - yt_dlp/utils/_utils.py
[ie/soundcloud] Adjust format sorting (#9584)
[yt-dlp.git] / yt_dlp / utils / _utils.py
1 import base64
2 import binascii
3 import calendar
4 import codecs
5 import collections
6 import collections.abc
7 import contextlib
8 import datetime
9 import email.header
10 import email.utils
11 import errno
12 import hashlib
13 import hmac
14 import html.entities
15 import html.parser
16 import inspect
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import mimetypes
23 import netrc
24 import operator
25 import os
26 import platform
27 import random
28 import re
29 import shlex
30 import socket
31 import ssl
32 import struct
33 import subprocess
34 import sys
35 import tempfile
36 import time
37 import traceback
38 import types
39 import unicodedata
40 import urllib.error
41 import urllib.parse
42 import urllib.request
43 import xml.etree.ElementTree
44
45 from . import traversal
46
47 from ..compat import functools # isort: split
48 from ..compat import (
49 compat_etree_fromstring,
50 compat_expanduser,
51 compat_HTMLParseError,
52 compat_os_name,
53 compat_shlex_quote,
54 )
55 from ..dependencies import xattr
56
# Make this submodule masquerade as its parent package (`yt_dlp.utils`) so
# that names defined here appear to come from the parent when introspected
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# Runtime type of a compiled regular expression (re.Pattern);
# this is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
61
62
class NO_DEFAULT:
    """Sentinel used to distinguish "argument not supplied" from `None`."""
    pass
65
66
def IDENTITY(x):
    """Identity function: return the argument unchanged."""
    return x
69
70
# Month-name tables used by the date parsing helpers
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# Timezone abbreviation -> UTC offset in hours
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# Accented character -> ASCII replacement table;
# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

# strptime formats tried (in order) when parsing dates
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Variants for locales where ambiguous numeric dates are day-first
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# Variants for locales where ambiguous numeric dates are month-first
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of P.A.C.K.E.R.-packed JavaScript
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches <script type="application/ld+json"> blocks (JSON-LD metadata)
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Unsigned decimal number with optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
172
173
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable for encoding text
        'TEST'.encode(encoding)
        return encoding
    except Exception:
        # Locale lookup failed or produced an unusable codec
        return 'UTF-8'
188
189
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # Write to a temp file in the same directory first, then rename over the
    # destination, so readers never observe a partially-written file
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile creates the file with restrictive permissions;
            # widen them to the usual umask-derived mode for a regular file
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file; re-raise the original error
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
214
215
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    # Only simple attribute names are supported
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
221
222 # On python2.6 the xml.etree.ElementTree.Element methods don't support
223 # the namespace parameter
224
225
def xpath_with_ns(path, ns_map):
    """Expand `prefix:tag` components of *path* to `{uri}tag` using `ns_map`."""
    def expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            # No namespace prefix on this component
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(component) for component in path.split('/'))
236
237
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element matching `xpath` (a string, or an iterable of
    candidate xpaths tried in order).

    @param default  value to return when nothing matches
    @param fatal    raise ExtractorError instead of returning None on no match
    """
    candidates = [xpath] if isinstance(xpath, str) else xpath
    for xp in candidates:
        n = node.find(xp)
        if n is not None:
            break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
    return None
259
260
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        # Not found (or the default itself was returned): pass it through
        return n
    if n.text is not None:
        return n.text
    # Element exists but has no text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
274
275
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the first element matching xpath[@key]."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = f'{xpath}[@{key}]' if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
287
288
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)
292
293
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
297
298
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    return matches[0] if matches else None
303
304
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_html_by_class(class_name, html)
    return matches[0] if matches else None
309
310
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the given attribute=value pair."""
    matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    return matches[0] if matches else None
314
315
def get_element_html_by_attribute(attribute, value, html, **kwargs):
    """Return the html of the first tag with the given attribute=value pair."""
    matches = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return matches[0] if matches else None
319
320
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match the class name as a whitespace/quote-delimited token in the attribute
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
326
327
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # Match the class name as a whitespace/quote-delimited token in the attribute
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
333
334
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [text for text, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
338
339
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [element for _, element in get_elements_text_and_html_by_attribute(*args, **kwargs)]
343
344
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    # If the value contains characters that cannot legally appear unquoted in
    # an HTML attribute value, quotes are mandatory; otherwise optional
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Match the opening tag up to (and including) the target attribute=value;
    # (?-x:...) disables verbose mode so `value` is taken literally
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Delegate extracting the full element to the tag-based parser
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip a fully-quoted content and decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
370
371
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Control-flow exception: signals that the outermost tag has closed
        pass

    def __init__(self):
        # Stack of currently-open tag names
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until the matching opening tag; implicitly closes unclosed inner tags
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The first-opened tag has now been closed
            raise self.HTMLBreakOnClosingTagException()
412
413
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index, but raising a custom exception instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed the opening tag first so the parser's stack starts with `tag`
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            # Advance candidate-by-candidate: each literal closing tag may
            # belong to a nested element, in which case the parser won't raise
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                # Outermost tag closed here: split out content and whole element
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
448
449
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the first start tag encountered
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        # Abort parsing immediately; the caller suppresses this exception
        raise compat_HTMLParseError('done')
460
461
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        self.items = []  # attribute dicts of top-level <li> elements
        self._level = 0  # current tag nesting depth

    def handle_starttag(self, tag, attrs):
        # Only collect attributes of <li> elements at the top level
        if self._level == 0 and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
477
478
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    # The parser raises compat_HTMLParseError after the first start tag;
    # suppress it — the attributes are already collected at that point
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
498
499
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of their attribute dictionaries"""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
507
508
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    if html is None:  # Convenience for sanitizing descriptions etc.
        return None

    # Collapse whitespace, then turn <br> and paragraph breaks into newlines
    cleaned = re.sub(r'\s+', ' ', html)
    cleaned = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', cleaned)
    cleaned = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', cleaned)
    # Strip any remaining tags, then decode HTML entities
    cleaned = re.sub('<.*?>', '', cleaned)
    return unescapeHTML(cleaned).strip()
523
524
class LenientJSONDecoder(json.JSONDecoder):
    """JSONDecoder that can pre-transform the input, ignore trailing garbage,
    and attempt to close a limited number of unterminated objects/arrays."""
    # TODO: Write tests

    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        # transform_source: callable applied to the raw string before decoding
        # ignore_extra: decode only the first JSON value, ignoring trailing data
        # close_objects: max number of unterminated containers to try closing
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # Each container may need two repair attempts (comma, then bracket)
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        # Attempt to repair the truncated document based on the decoder's
        # error message; returns the repaired string or None if inapplicable
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                # Re-raise with a snippet of the offending context for debugging
                raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
563
564
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        # '-' means standard output
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking unsupported or failed: fall back to a plain open()
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # EACCES is a permission problem; a different name won't help
            if attempt or err.errno in (errno.EACCES,):
                raise
            # First failure: retry once with a sanitized path
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
602
603
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a recognizable RFC 2822 date
        return None
    return email.utils.mktime_tz(parsed)
611
612
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # NUL ('\0') is used below as a marker for substituted characters,
        # so that runs of substitutes can be collapsed/stripped afterwards
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            # Drop control/combining marks; substitute everything else
            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        # Collapse runs of underscores and trim leading/trailing junk
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
666
667
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    # XXX: this handles drive relative paths (c:sth) incorrectly
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        # Non-Windows platforms need no sanitization unless forced
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters invalid in Windows path components (and trailing
    # whitespace/dots) with '#', keeping '.'/'..' components intact
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Preserve absoluteness of the path when forced sanitization is used
        sanitized_path.insert(0, os.path.sep)
    # TODO: Fix behavioral differences <3.12
    # The workaround using `normpath` only superficially passes tests
    # Ref: https://github.com/python/cpython/pull/100351
    return os.path.normpath(os.path.join(*sanitized_path))
693
694
def sanitize_url(url, *, scheme='http'):
    """Prepend a scheme to protocol-relative URLs and fix common scheme typos.

    None is passed through unchanged.
    """
    if url is None:
        return None
    if url.startswith('//'):
        # Prepend protocol-less URLs with `scheme` in order to mitigate
        # the number of unwanted failures due to missing protocol
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    for mistake, fixup in (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    ):
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
713
714
def extract_basic_auth(url):
    """Strip userinfo from `url`; return (clean_url, basic_auth_header_or_None)."""
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild the netloc without the user:password@ prefix
    netloc = parts.hostname if parts.port is None else '%s:%d' % (parts.hostname, parts.port)
    stripped_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = '%s:%s' % (parts.username, parts.password or '')
    auth_payload = base64.b64encode(credentials.encode())
    return stripped_url, f'Basic {auth_payload.decode()}'
725
726
def expand_path(s):
    """Expand shell variables and ~"""
    # NOTE(review): compat_expanduser is used instead of os.path.expanduser,
    # presumably for platform-specific quirks — see ..compat
    return os.path.expandvars(compat_expanduser(s))
730
731
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _uniq():
        seen = []  # a list, not a set: items may be unhashable
        for item in iterable:
            if item not in seen:
                seen.append(item)
                yield item

    generator = _uniq()
    return generator if lazy else list(generator)
742
743
744 def _htmlentity_transform(entity_with_semicolon):
745 """Transforms an HTML entity to a character."""
746 entity = entity_with_semicolon[:-1]
747
748 # Known non-numeric HTML entity
749 if entity in html.entities.name2codepoint:
750 return chr(html.entities.name2codepoint[entity])
751
752 # TODO: HTML5 allows entities without a semicolon.
753 # E.g. '&Eacuteric' should be decoded as 'Éric'.
754 if entity_with_semicolon in html.entities.html5:
755 return html.entities.html5[entity_with_semicolon]
756
757 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
758 if mobj is not None:
759 numstr = mobj.group(1)
760 if numstr.startswith('x'):
761 base = 16
762 numstr = '0%s' % numstr
763 else:
764 base = 10
765 # See https://github.com/ytdl-org/youtube-dl/issues/7518
766 with contextlib.suppress(ValueError):
767 return chr(int(numstr, base))
768
769 # Unknown entity in name, return its literal representation
770 return '&%s;' % entity
771
772
def unescapeHTML(s):
    """Replace HTML entities in `s` with their characters; None passes through."""
    if s is None:
        return None
    assert isinstance(s, str)

    # Each entity runs from '&' up to (and including) the next ';'
    return re.sub(r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
780
781
def escapeHTML(text):
    """Escape the five HTML-special characters in `text`."""
    # '&' must be replaced first so later escapes are not double-escaped
    for char, escaped in (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    ):
        text = text.replace(char, escaped)
    return text
791
792
class netrc_from_content(netrc.netrc):
    """A netrc.netrc that parses credentials from a string instead of a file."""

    def __init__(self, content):
        # Deliberately skip netrc.netrc.__init__ (which reads a file on disk)
        # and parse the in-memory content directly
        self.hosts = {}
        self.macros = {}
        with io.StringIO(content) as stream:
            self._parse('-', stream, False)
798
799
class Popen(subprocess.Popen):
    """subprocess.Popen wrapper: hides the console window on Windows, restores
    PyInstaller-modified library paths, defaults text mode to UTF-8 and adds
    convenience helpers (communicate_or_kill, run)."""

    if sys.platform == 'win32':
        # Prevent a console window from flashing up on Windows
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
             https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            # Not running from a PyInstaller bundle
            return

        def _fix(key):
            # PyInstaller saves the pre-bundle value under <key>_ORIG
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether output will be text so run() can pick its default
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            # Spawn cmd.exe explicitly with quoting-safe flags rather than
            # relying on shell=True
            if not isinstance(args, str):
                args = ' '.join(compat_shlex_quote(a) for a in args)
            shell = False
            args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        # Resolve the cmd.exe path from the environment
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        """communicate(), killing the process if anything goes wrong."""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        # timeout=None waits indefinitely; timeout=0 (default) does not wait
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run the process to completion; return (stdout, stderr, returncode)."""
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
870
871
def encodeArgument(s):
    """Return `s` as str, decoding ASCII byte strings for legacy callers."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
877
878
# Named duration tuple returned by timetuple_from_msec()
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into (hours, minutes, seconds, milliseconds)."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
887
888
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H<delim>]MM<delim>SS[.mmm]."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        formatted = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        formatted = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        formatted = '%d' % t.seconds
    return '%s.%03d' % (formatted, t.milliseconds) if msec else formatted
898
899
def bug_reports_message(before=';'):
    """Build the standard "please report this issue" blurb, joined after `before`."""
    from ..update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        # Starting a new sentence: capitalize the first word
        msg = msg[0].title() + msg[1:]

    if not before:
        return msg
    return before + ' ' + msg
911
912
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Subclasses may provide a class-level default message
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            # Neither an explicit nor a class-level message: use the class name
            self.msg = type(self).__name__
        super().__init__(self.msg)
923
924
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions
        # Network failures are expected conditions, not bugs
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # Collapse chains of ExtractorErrors down to the innermost exc_info
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: "[ie] video_id: msg (caused by ...)" plus the
        # bug-report blurb for unexpected errors
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Join the stored traceback with the cause's traceback, if any."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Keep msg/args in sync whenever a contributing attribute changes
        # after initialization (msg is truthy only once __init__ has run)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
967
968
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL."""

    def __init__(self, url):
        super().__init__(f'Unsupported URL: {url}', expected=True)
        self.url = url
974
975
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
979
980
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Geo-restriction is an expected condition, never a bug
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries
992
993
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        # A falsy message falls back to the generic one
        message = msg or 'The channel is not currently live'
        kwargs['expected'] = True
        super().__init__(message, **kwargs)
1000
1001
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info())."""
        self.exc_info = exc_info
        super().__init__(msg)
1014
1015
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'
1023
1024
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        # Append the colliding filename so the user can see which output
        # path was reused (previously a literal placeholder was emitted)
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)
1037
1038
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
1045
1046
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'
1050
1051
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1055
1056
class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1060
1061
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1065
1066
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        # Whether this condition is anticipated (analogous to ExtractorError.expected)
        self.expected = expected
1073
1074
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always reported with expected=False
        super().__init__(self.msg, expected=False)
1081
1082
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        # Append the underlying error to the instance message, if any
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1095
1096
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
1110
1111
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing xattr metadata fails; classifies a coarse `reason`."""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code, self.msg = code, msg

        # Derive a machine-readable reason from the errno and/or message text
        if self.code in (errno.ENOSPC, errno.EDQUOT) or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg:
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1126
1127
class XAttrUnavailableError(YoutubeDLError):
    # Raised when no usable xattr implementation is available on this system
    pass
1130
1131
def is_path_like(f):
    """Return True if `f` can be used as a filesystem path (str/bytes/PathLike)."""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1134
1135
def extract_timezone(date_str):
    # Split a trailing timezone specifier off `date_str`.
    # Returns (timezone, date_str): a datetime.timedelta UTC offset and the
    # date string with the timezone portion removed.
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
            (?P<sign>\+|-)                                       # +/-
            (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})           # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # No numeric offset: try a trailing timezone *name* (e.g. 'EST')
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        # Unknown names fall back to UTC (timezone stays None -> hours=0)
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # Plain 'Z' suffix: UTC
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1164
1165
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # Fractional seconds are not representable by strptime's fixed format
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
    except ValueError:
        return None
    return calendar.timegm(dt.timetuple())
1181
1182
def date_formats(day_first=True):
    """Select the strptime format list for day-first vs month-first dates."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1185
1186
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    # NB: no break — the last format that successfully parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822 style parsing
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)
1209
1210
def unified_timestamp(date_str, day_first=True):
    """Convert a free-form date/time string into a UNIX timestamp, or None."""
    if not isinstance(date_str, str):
        return None

    # Drop commas/pipes and weekday names, then collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # Note: detected before AM/PM removal below
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to RFC 2822 style parsing
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1242
1243
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL; `default_ext` when none is found."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1255
1256
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    # Build a subtitle filename like 'video.en.srt' from 'video.<ext>'
    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1259
1260
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Relative offset: <base><sign><amount><unit>, applied recursively to <base>
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # Month/year offsets need calendar-aware arithmetic
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # Round to the unit the user actually specified
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1301
1302
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1313
1314
def datetime_add_months(dt, months):
    """Shift `dt` by `months` (may be negative), clamping the day-of-month
    to the length of the target month (e.g. Jan 31 + 1 month -> Feb 28/29)."""
    total_months = dt.month - 1 + months
    target_year = dt.year + total_months // 12
    target_month = total_months % 12 + 1
    target_day = min(dt.day, calendar.monthrange(target_year, target_month)[1])
    return dt.replace(target_year, target_month, target_day)
1322
1323
def datetime_round(dt, precision='day'):
    """Round `dt` to the nearest `precision` unit.

    Returns an aware UTC datetime, except for 'microsecond' which returns
    `dt` unchanged.
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    stamp = calendar.timegm(dt.timetuple())
    rounded = ((stamp + seconds_per_unit / 2) // seconds_per_unit) * seconds_per_unit
    return datetime.datetime.fromtimestamp(rounded, datetime.timezone.utc)
1340
1341
def hyphenate_date(date_str):
    """Convert 'YYYYMMDD' to 'YYYY-MM-DD'; any other input is returned unchanged."""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(m.groups()) if m else date_str
1350
1351
class DateRange:
    """Inclusive interval between two dates."""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = date_from_str(start, strict=True) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end, strict=True) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        needle = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= needle <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __str__(self):
        return '{} to {}'.format(self.start, self.end)

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
1388
1389
@functools.cache
def system_identifier():
    """Return a one-line description of the Python/OS environment (for bug reports)."""
    python_implementation = platform.python_implementation()
    if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        python_implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1408
1409
@functools.cache
def get_windows_version():
    """Get the Windows version as a tuple; returns () when not on Windows."""
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1417
1418
def write_string(s, out=None, encoding=None):
    """Write string `s` to `out` (default: sys.stderr), handling byte streams
    and platform quirks, then flush."""
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # Prepend a space before newlines to work around console rendering quirks
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
    if 'b' in (getattr(out, 'mode', None) or ''):
        # Binary stream: we must encode ourselves
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: write bytes to it
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1439
1440
# TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Emit a deprecation message: printed (once per message) when running as
    the CLI, or raised as a DeprecationWarning when used as a library."""
    from .. import _IN_CLI
    if _IN_CLI:
        # De-duplicate: each distinct message is shown at most once
        if msg in deprecation_warning._cache:
            return
        deprecation_warning._cache.add(msg)
        if printer:
            return printer(f'{msg}{bug_reports_message()}', **kwargs)
        return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
    else:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)


deprecation_warning._cache = set()
1457
1458
def bytes_to_intlist(bs):
    """Return the byte values of `bs` as a list of ints (str input: code points)."""
    if not bs:
        return []
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
1466
1467
def intlist_to_bytes(xs):
    """Pack a sequence of ints (0-255) into a bytes object."""
    if not xs:
        return b''
    return struct.pack(f'{len(xs)}B', *xs)
1472
1473
class LockingUnsupportedError(OSError):
    # Raised by the no-op lock implementations when no locking backend exists
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1479
1480
# Cross-platform file locking
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) for the current
# platform: Win32 LockFileEx/UnlockFileEx via ctypes, POSIX fcntl, or stubs
# that raise LockingUnsupportedError when neither is available.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file (low/high dwords of the byte count)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive for the matching UnlockFileEx call
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # 0x2 = exclusive lock, 0x1 = fail immediately instead of blocking
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            # Try each unlock mechanism in turn; the first that succeeds wins
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
1567
1568
class locked_file:
    """File wrapper that opens with a cross-platform advisory lock.

    Shared lock for read-only modes, exclusive otherwise. Truncation for 'w'
    modes is deferred until after the lock is acquired. Usable as a context
    manager; unhandled attributes delegate to the underlying file object.
    """
    # Whether the lock is currently held
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        # Only read-only modes take a shared lock
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only after the lock is held, so readers never see a
            # half-truncated file
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the object can also be used without a `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1632
1633
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' if it is None."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
1638
1639
def shell_quote(args):
    """Quote each argument for safe use on a shell command line and join with spaces."""
    fs_encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(a.decode(fs_encoding) if isinstance(a, bytes) else a)
        for a in args)
1649
1650
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, existing_data = unsmuggle_url(url, {})
    data.update(existing_data)
    payload = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{payload}'
1659
1660
def unsmuggle_url(smug_url, default=None):
    """Extract data smuggled into a URL fragment; returns (url, data or `default`)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1668
1669
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format a number with a decimal suffix (k, M, G, ...); binary (Ki, Mi, ...)
    when factor=1024. Returns None for missing or negative input."""
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    suffixes = 'kMGTPEZY'
    exponent = min(int(math.log(num, factor)), len(suffixes)) if num else 0
    suffix = ('', *suffixes)[exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    return fmt % (num / factor ** exponent, suffix)
1682
1683
def format_bytes(bytes):
    # NB: parameter shadows the builtin `bytes`; kept for API compatibility
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
1686
1687
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number> <unit>' using the multipliers in `unit_table`.

    Returns the rounded product, or None when `s` does not match.
    Non-strict mode accepts ',' as a decimal separator and only anchors
    at the start of the string.
    """
    num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = (re.fullmatch if strict else re.match)(
        rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None

    num = float(m.group('num').replace(',', '.'))
    mult = unit_table[m.group('unit')]
    return round(num * mult)
1699
1700
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    # Units are binary powers: K=1024, M=1024**2, ... (case-insensitive input)
    return lookup_unit_table(
        {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
        s.upper(), strict=True)
1706
1707
def parse_filesize(s):
    """Parse a human-readable file size ('5.4 MiB', '300kb', ...) into bytes, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # NB: the casing quirks below ('kB': 1024 but 'Kb'/'kb': 1000, 'mB': 1024**2,
    # etc.) are deliberate and preserved for compatibility with sites in the wild
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1777
1778
def parse_count(s):
    """Parse a view/like-style count ('1.2M', '3,456', 'Views: 7k') into an int, or None."""
    if s is None:
        return None

    # Strip a leading non-numeric label (e.g. 'Views ')
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    # k/m/b suffixes; 'kk' is sometimes used for millions
    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    ret = lookup_unit_table(_UNIT_TABLE, s)
    if ret is not None:
        return ret

    # Fall back to a leading plain number followed by other text
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))
1806
1807
def parse_resolution(s, *, lenient=False):
    """Extract {'width', 'height'} (or only 'height') from a resolution-like string.

    Recognizes 'WxH', '<N>p'/'<N>i' and '4k'/'8k' forms; returns {} when nothing
    matches or `s` is None. `lenient` drops the word-boundary guards around WxH.
    """
    if s is None:
        return {}

    if lenient:
        wxh_pattern = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    else:
        wxh_pattern = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    m = re.search(wxh_pattern, s)
    if m:
        return {'width': int(m.group('w')), 'height': int(m.group('h'))}

    m = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if m:
        return {'height': int(m.group(1))}

    m = re.search(r'\b([48])[kK]\b', s)
    return {'height': int(m.group(1)) * 540} if m else {}
1831
1832
def parse_bitrate(s):
    """Return the bitrate in kbps from a string like '128 kbps', else None."""
    if not isinstance(s, str):
        return None
    m = re.search(r'\b(\d+)\s*kbps', s)
    return int(m.group(1)) if m else None
1839
1840
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    return names.index(name) + 1 if name in names else None
1850
1851
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    return abbreviations.index(abbrev) + 1 if abbrev in abbreviations else None
1860
1861
def fix_xml_ampersands(xml_str):
    """Escape every '&' that does not already start a recognized XML entity."""
    bare_amp = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_amp.sub('&amp;', xml_str)
1868
1869
def setproctitle(title):
    """Best-effort: set the process name via libc prctl(PR_SET_NAME).
    Silently does nothing when ctypes/libc/prctl are unavailable."""
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # PR_SET_NAME = 15      Ref: /usr/include/linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1896
1897
def remove_start(s, start):
    """Strip `start` from the beginning of `s` if present; None passes through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1900
1901
def remove_end(s, end):
    """Strip `end` from the end of `s` if present; None passes through."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
1904
1905
def remove_quotes(s):
    """Strip one matching pair of surrounding quotes (single or double)."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1913
1914
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1921
1922
def url_basename(url):
    """Return the last path component of a URL ('' for bare hosts)."""
    path = urllib.parse.urlparse(url).path
    return path.strip('/').rsplit('/', 1)[-1]
1926
1927
def base_url(url):
    """Return the URL up to and including the last '/' before any query/fragment.
    Raises AttributeError when the URL has no path slash to anchor on."""
    m = re.match(r'https?://[^?#]+/', url)
    return m.group()
1930
1931
def urljoin(base, path):
    """Join `base` and `path` tolerantly: accepts bytes, passes through
    absolute/scheme-relative paths, and returns None for unusable inputs."""
    if isinstance(path, bytes):
        path = path.decode()
    if not path or not isinstance(path, str):
        return None
    # Already absolute (has a scheme) or scheme-relative
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
1945
1946
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce `v` to int (optionally reading attribute `get_attr` first),
    scaled by invscale/scale; return `default` when conversion fails."""
    value = getattr(v, get_attr, None) if get_attr and v is not None else v
    try:
        return int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
1954
1955
def str_or_none(v, default=None):
    """str(v), or `default` when v is None."""
    if v is None:
        return default
    return str(v)
1958
1959
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Tolerate thousands separators and a leading '+'
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
1967
1968
def float_or_none(v, scale=1, invscale=1, default=None):
    """float(v) * invscale / scale, or `default` when v is None or unparsable."""
    try:
        return default if v is None else float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
1976
1977
def bool_or_none(v, default=None):
    """Return v only when it is a real bool; otherwise `default`."""
    if isinstance(v, bool):
        return v
    return default
1980
1981
def strip_or_none(v, default=None):
    """v.strip() for strings; `default` for anything else (including None)."""
    if isinstance(v, str):
        return v.strip()
    return default
1984
1985
def url_or_none(url):
    """Return the stripped URL if it uses a recognized scheme (http(s), rtmp
    family, rtsp, mms, ftp(s)) or is scheme-relative; else None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
1991
1992
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a unix timestamp (int/float) or a 'YYYYMMDD' string using
    `date_format`; return `default` on any failure. Supports '%s' in the
    format (epoch seconds), which strftime lacks on Windows."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            dt_obj = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                      + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            dt_obj = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            dt_obj = None  # -> AttributeError below -> default
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(dt_obj.timestamp())}', date_format)
        return dt_obj.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2010
2011
def parse_duration(s):
    """Parse a duration string into seconds (float), or None if unparsable.

    Accepts clock-style '[[DD:]HH:]MM:]SS[.ms]', ISO-8601-like 'PT1H2M3S',
    and free-form '3 hours', '10 mins' notations.
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # 1) Clock style: seconds are limited to two digits only when preceded
    #    by a minutes component (conditional group on before_secs)
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2) ISO-8601-ish / verbose: years, months and weeks are matched but
        #    deliberately ignored in the final sum
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3) Fractional '1.5 hours' / '90 mins' forms
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        ms = ms.replace(':', '.')  # '1:23:45:67' style milliseconds use ':'
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2066
2067
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the file's real extension.

    'file.mp4' + 'temp' -> 'file.temp.mp4'. If *expected_real_ext* is given
    and the actual extension differs, *ext* is appended to the whole filename
    instead ('file.unknown' -> 'file.unknown.temp').

    BUG FIX: the mismatch branch previously returned the literal
    '(unknown).{ext}', silently discarding the original filename.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else f'{filename}.{ext}')
2074
2075
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's extension with *ext*.

    If *expected_real_ext* is given and the actual extension differs,
    *ext* is appended to the whole filename instead of replacing it.
    """
    stem, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    return f'{stem}.{ext}'
2081
2082
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        # OSError means the binary could not be found/started
        Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
2091
2092
def _get_exe_version_output(exe, args):
    """Run `exe args` and return its combined stdout/stderr output.

    Returns None if the executable ran but exited non-zero, and
    False if it could not be started at all.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
                                   stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if ret:
            return None
    except OSError:
        return False
    return stdout
2105
2106
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's --version output.

    Uses *version_re* (group 1 is the version) or a generic default;
    returns *unrecognized* when no version can be found.
    """
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    match = re.search(version_re, output)
    return match.group(1) if match else unrecognized
2116
2117
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        # The executable exists but exited with an error
        return unrecognized[-1]
    if not output:
        # False (not installed) or empty output
        return output
    return detect_exe_version(output, version_re, unrecognized[0])
2128
2129
def frange(start=0, stop=None, step=1):
    """Float range"""
    if stop is None:
        start, stop = 0, start
    if not step:
        # Mirrors the original: a zero step yields nothing
        return
    direction = 1 if step > 0 else -1
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2138
2139
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Raised instead of the builtin IndexError so callers can tell
        # "ran off the underlying iterable" apart from unrelated errors
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache is intentionally shared (not copied) by __copy__/__reversed__
        # so already-evaluated items are reused between views
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        # Pull every remaining item into the cache
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map an index on the reversed view to the underlying cache (~x == -x - 1)
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Otherwise, evaluate only as many items as the index/slice needs
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Evaluating a single end element is enough to know emptiness
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2227
2228
class PagedList:
    """Base class for lazily-fetched, page-based entry lists.

    Subclasses implement _getslice(); pages are fetched through
    self._pagefunc(pagenum) and optionally cached per page number.
    """

    class IndexError(IndexError):
        # Raised for out-of-range entries so callers can distinguish them
        # from unrelated IndexErrors
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc  # pagenum -> iterable of entries
        self._pagesize = pagesize
        self._pagecount = float('inf')  # unknown until a page fetch fails
        self._use_cache = use_cache
        self._cache = {}  # pagenum -> list of entries

    def getpage(self, pagenum):
        """Return the (possibly cached) list of entries of page *pagenum*."""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            # Pages beyond the known page count are empty by definition
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a plain list."""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # NOTE: cache must be enabled, since this is implemented via getslice
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]

    def __bool__(self):
        # True iff there is at least one entry
        return bool(self.getslice(0, 1))
2270
2271
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                # Page lies entirely before the requested range
                continue

            # Offsets of the requested range within the current page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember that this and all later pages are unavailable
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2311
2312
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        # use_cache is always enabled for this variant
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        skip_elems = start - start_page * self._pagesize  # offset into the first page
        only_more = None if end is None else end - start  # entries still wanted
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    # This page satisfies the remainder of the request
                    yield from page_results[:only_more]
                    break
            yield from page_results
2337
2338
class PlaylistEntries:
    """Resolve a playlist info_dict's 'entries' according to the user's
    --playlist-items / --playlist-start / --playlist-end selection,
    yielding (1-based index, entry) pairs."""

    MissingEntry = object()  # sentinel: the entry at this index was not extracted
    is_exhausted = False  # True once the full playlist length is known

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Spread the already-extracted entries over their original
            # (1-based) playlist positions, leaving gaps as MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches one '--playlist-items' segment: START[:-END[:STEP]]
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or slice per comma-separated segment of a
        --playlist-items specification; raises ValueError on bad input."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # 'inf' ends become float('inf') through float_or_none
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (index, entry) pairs for the user's selection, stopping
        early when the match filter signals it."""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Total entry count, if determinable without further extraction."""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function i -> entry normalizing the different backing
        # containers' error behavior into self.IndexError / EntryNotInPlaylist
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Route extraction errors through YDL's standard handling
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Accepts an int or slice of 1-based playlist positions and yields
        # (1-based index, entry) pairs
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2473
2474
def uppercase_escape(s):
    """Decode '\\UXXXXXXXX' escape sequences in *s* into their characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
2481
2482
def lowercase_escape(s):
    """Decode '\\uXXXX' escape sequences in *s* into their characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
2489
2490
def parse_qs(url, **kwargs):
    """Parse the query string of *url* into a dict of lists (urllib.parse.parse_qs)."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2493
2494
def read_batch_urls(batch_fd):
    """Read a batch file object and return its list of URLs.

    Skips blank lines and comments ('#', ';', ']'), strips BOMs and
    surrounding whitespace, and cuts trailing ' #...' comments.
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit must be passed by keyword — positional use is
        # deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2512
2513
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data (same signature as urllib.parse.urlencode) and
    return it as ASCII bytes suitable for a request body."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2516
2517
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url str or parse url tuple
    @param query_update update query
    @returns str
    """
    if isinstance(url, str):
        if not kwargs and not query_update:
            # Nothing to change — skip the parse/unparse round-trip
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2536
2537
def update_url_query(url, query):
    """Merge the dict *query* into the query string of *url* and return the new URL."""
    return update_url(url, query_update=query)
2540
2541
2542 def _multipart_encode_impl(data, boundary):
2543 content_type = 'multipart/form-data; boundary=%s' % boundary
2544
2545 out = b''
2546 for k, v in data.items():
2547 out += b'--' + boundary.encode('ascii') + b'\r\n'
2548 if isinstance(k, str):
2549 k = k.encode()
2550 if isinstance(v, str):
2551 v = v.encode()
2552 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2553 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2554 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2555 if boundary.encode('ascii') in content:
2556 raise ValueError('Boundary overlaps with data')
2557 out += content
2558
2559 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2560
2561 return out, content_type
2562
2563
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    # Retry with fresh random boundaries until none collides with the data
    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            if has_specified_boundary:
                raise
            boundary = None
2592
2593
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """True if *x* is an instance of *allowed_types* but not of *blocked_types*
    (by default str, bytes and mappings don't count as iterable-like)."""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
2598
2599
def variadic(x, allowed_types=NO_DEFAULT):
    """Return *x* unchanged if it is iterable-like, else wrap it in a 1-tuple."""
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
2605
2606
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function with *args*/*kwargs*, returning the first result
    that does not raise a common lookup/arithmetic error and (if given)
    is an instance of *expected_type*. Returns None if none qualify."""
    for fn in funcs:
        try:
            result = fn(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
2616
2617
def try_get(src, getter, expected_type=None):
    """Apply one or more getter callables to *src*, returning the first
    result that doesn't raise and (if given) matches *expected_type*."""
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2620
2621
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of *dct* keeping only items for which cndn(key, value)
    is truthy (by default, drops None values)."""
    return {key: value for key, value in dct.items() if cndn(key, value)}
2624
2625
def merge_dicts(*dicts):
    """Merge dicts left-to-right: the first non-None value for a key wins,
    except that a later string value replaces an earlier empty string."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            if key not in merged:
                if value is not None:
                    merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
2634
2635
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as str, decoding bytes-like input with *encoding*.

    NOTE: the default encoding is evaluated once, at module import time.
    """
    return string if isinstance(string, str) else str(string, encoding, errors)
2638
2639
# US (MPAA) movie ratings mapped to a minimum viewer age
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines mapped to a minimum viewer age
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2657
2658
def parse_age_limit(s):
    """Parse an age limit (int 0-21, 'NN+', a US movie rating, or a TV
    parental guideline) into an int, or None if unrecognized."""
    # isinstance(False, int) is True, so an exact type check is needed here
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    mobj = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if mobj:
        return TV_PARENTAL_GUIDELINES['TV-' + mobj.group(1)]
    return None
2675
2676
def strip_jsonp(code):
    """Strip a JSONP wrapper — e.g. 'window.cb && cb({...}); // trailer' —
    returning only the JSON payload passed to the callback."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
2685
2686
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript object/expression into valid JSON text.

    @param vars    dict of identifier -> JSON text to substitute
    @param strict  if True, raise ValueError on unknown identifiers
                   instead of quoting them as strings
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # Bare hex/octal integers (possibly used as object keys, hence the ':')
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Keep escapes JSON understands, rewrite \xNN to \u00NN, drop
        # escaped newlines (line continuations), unescape everything else
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Recursively evaluate a `${...}` template-string interpolation
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            # Re-quote strings with double quotes, fixing up escapes and
            # (for backtick strings) template interpolations
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # A trailing ':' means this integer is an object key
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # new Map([[k, v], ...]) -> plain JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # Best-effort rewrites of common constructor/function-call patterns
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2766
2767
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def lookup(qid):
        # Unknown ids rank below all known ones
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return lookup
2776
2777
# Stages at which post-processors may be scheduled to run
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output filename templates
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Output template kind -> default filename suffix (None = no dedicated suffix)
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
2798
# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template placeholders: {0} = allowed key pattern, {1} = allowed conversion types
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
    '''


# All conversion types accepted by %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2817
2818
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
2827
2828
def version_tuple(v):
    """Split a dot/dash-separated version string into a tuple of ints.
    Raises ValueError on non-numeric components."""
    return tuple(map(int, re.split(r'[-.]', v)))
2831
2832
def is_outdated_version(version, limit, assume_new=True):
    """True if *version* is older than *limit*; falls back to *assume_new*
    (inverted) when either string is missing or unparsable."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
2840
2841
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported locally to avoid a circular dependency with the update module
    from ..update import is_non_updateable

    return not is_non_updateable()
2848
2849
def args_to_str(args):
    """Shell-quote and join *args* for display (not for execution)."""
    # Get a short string representation for a subprocess command
    return ' '.join(compat_shlex_quote(a) for a in args)
2853
2854
def error_to_str(err):
    """Format an exception as 'ExceptionType: message' for display."""
    kind = type(err).__name__
    return f'{kind}: {err}'
2857
2858
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (optionally with parameters, e.g. 'video/mp4; codecs=..')
    to a file extension.

    Falls back to *default* (if given) for non-str input, otherwise derives
    an extension from the subtype.
    """
    if not isinstance(mt, str):
        if default is not NO_DEFAULT:
            return default
        return None

    # Keyed by full MIME type or bare subtype
    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'video/ogg': 'ogv',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Strip parameters ('; charset=...') and normalize case
    mimetype = mt.partition(';')[0].strip().lower()
    _, _, subtype = mimetype.rpartition('/')

    # Try full type, then subtype, then the subtype's '+'-suffix (e.g. 'xml')
    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    elif default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
2948
2949
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension or URL, or None."""
    if not ext_or_url:
        return None
    if '.' not in ext_or_url:
        # Turn a bare extension into a fake filename so guess_type works
        ext_or_url = f'file.{ext_or_url}'
    mime, _ = mimetypes.guess_type(ext_or_url)
    return mime
2956
2957
def parse_codecs(codecs_str):
    """Parse an RFC 6381 'codecs' attribute into vcodec/acodec/scodec and
    dynamic-range info. Returns {} when nothing could be identified."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Strip leading zeroes from each dotted part before classifying
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                # Only the first video codec is kept
                continue
            vcodec = full_codec
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Two unknown codecs: assume video + audio, in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
2998
2999
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension able to hold the given video/audio codecs
    and extensions, honoring *preferences* where possible (mkv as fallback)."""
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Normalize a codec list: first entry, base fourcc, zeroes stripped, lowercased
    sanitize_codec = functools.partial(
        try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    # First pass: pick by codec compatibility
    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    # Second pass: pick by extension-family compatibility
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3039
3040
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Guess a file extension from a response's headers: Content-Disposition
    filename, then x-amz-meta-name, then Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    # S3-style metadata sometimes carries the original filename
    meta_ext = getheader('x-amz-meta-name')
    if meta_ext:
        e = meta_ext.rpartition('.')[2]
        if e:
            return e

    return mimetype2ext(getheader('Content-Type'), default=default)
3059
3060
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI for *data* (bytes) with the given MIME type."""
    encoded = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{encoded}'
3063
3064
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit set, or content available for everyone
        return False
    return age_limit < content_limit
3073
3074
# List of known byte-order-marks (BOM)
# NOTE: the UTF-32 entries precede UTF-16 — b'\xff\xfe' (UTF-16-LE) is a
# prefix of b'\xff\xfe\x00\x00' (UTF-32-LE), and consumers scan in order
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3083
3084
3085 def is_html(first_bytes):
3086 """ Detect whether a file contains HTML by examining its first bytes. """
3087
3088 encoding = 'utf-8'
3089 for bom, enc in BOMS:
3090 while first_bytes.startswith(bom):
3091 encoding, first_bytes = enc, first_bytes[len(bom):]
3092
3093 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3094
3095
def determine_protocol(info_dict):
    """Return the download protocol for *info_dict*, deriving it from the URL
    when the 'protocol' field is not set explicitly."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for proto in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(proto):
            return proto

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    # Fall back to the URL scheme (http, https, ...)
    return urllib.parse.urlparse(url).scheme
3116
3117
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param header_row   Column headers
    @param data         List of rows, each a list of cell values
    @param delim        String used to draw a separator line below the header, or False for none
    @param extra_gap    Number of extra spaces between columns
    @param hide_empty   Drop columns whose data cells all have zero visible width
    """
    def width(string):
        # Visible width: terminal escape sequences and the \t alignment marker take no space
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only the columns whose corresponding filter entry is truthy
        # (missing entries default to True, so extra columns are kept)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # When hiding empty columns, filter both the header and every data row
    # using the per-column maximum widths of the data alone
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Right-align the part after \t by replacing it with the needed padding
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3148
3149
3150 def _match_one(filter_part, dct, incomplete):
3151 # TODO: Generalize code with YoutubeDL._build_format_filter
3152 STRING_OPERATORS = {
3153 '*=': operator.contains,
3154 '^=': lambda attr, value: attr.startswith(value),
3155 '$=': lambda attr, value: attr.endswith(value),
3156 '~=': lambda attr, value: re.search(value, attr),
3157 }
3158 COMPARISON_OPERATORS = {
3159 **STRING_OPERATORS,
3160 '<=': operator.le, # "<=" must be defined above "<"
3161 '<': operator.lt,
3162 '>=': operator.ge,
3163 '>': operator.gt,
3164 '=': operator.eq,
3165 }
3166
3167 if isinstance(incomplete, bool):
3168 is_incomplete = lambda _: incomplete
3169 else:
3170 is_incomplete = lambda k: k in incomplete
3171
3172 operator_rex = re.compile(r'''(?x)
3173 (?P<key>[a-z_]+)
3174 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3175 (?:
3176 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3177 (?P<strval>.+?)
3178 )
3179 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3180 m = operator_rex.fullmatch(filter_part.strip())
3181 if m:
3182 m = m.groupdict()
3183 unnegated_op = COMPARISON_OPERATORS[m['op']]
3184 if m['negation']:
3185 op = lambda attr, value: not unnegated_op(attr, value)
3186 else:
3187 op = unnegated_op
3188 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3189 if m['quote']:
3190 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3191 actual_value = dct.get(m['key'])
3192 numeric_comparison = None
3193 if isinstance(actual_value, (int, float)):
3194 # If the original field is a string and matching comparisonvalue is
3195 # a number we should respect the origin of the original field
3196 # and process comparison value as a string (see
3197 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3198 try:
3199 numeric_comparison = int(comparison_value)
3200 except ValueError:
3201 numeric_comparison = parse_filesize(comparison_value)
3202 if numeric_comparison is None:
3203 numeric_comparison = parse_filesize(f'{comparison_value}B')
3204 if numeric_comparison is None:
3205 numeric_comparison = parse_duration(comparison_value)
3206 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3207 raise ValueError('Operator %s only supports string values!' % m['op'])
3208 if actual_value is None:
3209 return is_incomplete(m['key']) or m['none_inclusive']
3210 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3211
3212 UNARY_OPERATORS = {
3213 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3214 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3215 }
3216 operator_rex = re.compile(r'''(?x)
3217 (?P<op>%s)\s*(?P<key>[a-z_]+)
3218 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3219 m = operator_rex.fullmatch(filter_part.strip())
3220 if m:
3221 op = UNARY_OPERATORS[m.group('op')]
3222 actual_value = dct.get(m.group('key'))
3223 if is_incomplete(m.group('key')) and actual_value is None:
3224 return True
3225 return op(actual_value)
3226
3227 raise ValueError('Invalid filter part %r' % filter_part)
3228
3229
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
                      Can be True/False to indicate all/none of the keys may be missing.
                      All conditions on incomplete keys pass if the key is missing
    """
    # '&' joins conditions; a backslash escapes a literal '&'
    parts = re.split(r'(?<!\\)&', filter_str)
    return all(_match_one(part.replace(r'\&', '&'), dct, incomplete) for part in parts)
3240
3241
def match_filter_func(filters, breaking_filters=None):
    """Build a match-filter callable from filter expression string(s).

    @param filters           A filter string or collection of filter strings;
                             a lone '-' entry switches on "interactive" mode
    @param breaking_filters  Filter(s) whose rejection raises RejectedVideoReached
    @returns                 None if no filters were given; otherwise a function
                             (info_dict, incomplete=False) returning None when the
                             video passes, NO_DEFAULT when it passes in interactive
                             mode, or a skip-reason string when it is rejected
    """
    if not filters and not breaking_filters:
        return None
    repr_ = f'{match_filter_func.__module__}.{match_filter_func.__qualname__}({filters}, {breaking_filters})'

    # Breaking filters are themselves compiled with this function; when they
    # reject, _match_func raises instead of merely skipping
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    # A '-' entry makes passing (complete) videos return NO_DEFAULT instead of None
    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    @function_with_repr.set_repr(repr_)
    def _match_func(info_dict, incomplete=False):
        ret = breaking_filters(info_dict, incomplete)
        if ret is not None:
            raise RejectedVideoReached(ret)

        # An empty filter set (e.g. only '-' was given) matches everything
        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            return NO_DEFAULT if interactive and not incomplete else None
        else:
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3267
3268
class download_range_func:
    """Callable that yields the sections (matched chapters and/or explicit
    time ranges) of a video to download."""

    def __init__(self, chapters, ranges, from_info=False):
        # chapters: regexes matched against chapter titles
        # ranges: (start_time, end_time) pairs; negative values count back from the end
        # from_info: also yield a section built from the info dict's own start_time/end_time
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
        # Only warn when chapter regexes were requested but nothing matched
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # No sections requested at all: one empty section = the whole video
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # Negative timestamps are relative to the end of the video (when the duration is known)
        return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time

    def __eq__(self, other):
        # NOTE(review): from_info is not part of the comparison — confirm this is intentional
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges)

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3309
3310
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds, or None if unparsable."""
    if not time_expr:
        return None

    # Plain offset, e.g. "12.3" or "12.3s"
    match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if match:
        return float(match.group('time_offset'))

    # Clock time, e.g. "0:01:02.5"; a ':' before the fraction is read as a decimal point
    match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if match:
        hours, minutes, seconds = match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
3322
3323
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours, mins, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, msec)
3326
3327
def ass_subtitles_timecode(seconds):
    """Format a duration in seconds as an ASS timecode (H:MM:SS.cc, i.e. centiseconds)."""
    hours, mins, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (hours, mins, secs, msec / 10)
3331
3332
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitle data to SRT.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no paragraph elements
    '''
    # Legacy TTML namespaces are rewritten to their modern equivalents before parsing
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # Only these tts:* style attributes are translated into SRT markup
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser:
        # NOTE(review): these are class-level (shared) mutable attributes; balanced
        # start/end calls leave the lists empty between uses, but a fresh instance
        # is created per paragraph in parse_node anyway — confirm before changing
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Effective style = document default, overlaid by the referenced
                # style id, overlaid by inline tts:* attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from the enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize one <p> element through the TTML parser to get its SRT text
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; re-run until every parent style has been seen
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    # Parent not processed yet — retry on the next pass
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style referenced by <body> or <div> becomes the document default
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3499
3500
def cli_option(params, command_option, param, separator=None):
    """Build the CLI arguments for an option whose value is params[param].

    Returns [] when the value is unset, [command_option, value] without a
    separator, or a single joined 'optionSEPvalue' argument with one.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3506
3507
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the CLI arguments for a boolean option, mapping True/False to the
    given string values; an unset (None) parameter yields no arguments."""
    flag = params.get(param)
    assert flag in (True, False, None)
    # Reuse cli_option by looking the flag up in a bool -> string mapping
    return cli_option({True: true_value, False: false_value}, command_option, flag, separator)
3512
3513
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3516
3517
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Look up extra CLI arguments in argdict, trying each entry of keys in turn.

    @param argdict     Mapping of lowercase keys to argument lists; a plain
                       list/tuple is accepted for backward compatibility
    @param keys        Keys (or tuples of alternative keys) to try, in priority order
    @param default     Returned when nothing matches
    @param use_compat  Whether a legacy list/tuple argdict is returned as-is
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        return argdict if use_compat else default
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        # Collect the argument lists of every matching key alternative
        matches = [args for args in (argdict.get(key.lower()) for key in variadic(key_list))
                   if args is not None]
        if matches:
            return list(itertools.chain.from_iterable(matches))
    return default
3536
3537
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve extra CLI arguments for a main_key/executable combination by
    building the prioritized key list for cli_configuration_args."""
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    lookup_keys = [f'{root_key}{key}' for key in (keys or [''])]
    if root_key not in lookup_keys:
        # Only variant keys were requested: no generic fallbacks, no compat mode
        use_compat = False
    else:
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3549
3550
class ISO639Utils:
    """Conversions between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are used, so tags like "en-US" also resolve;
        # returns None for unknown codes
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear scan over the map; falls through (returns None) when no match is found
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
3755
3756
class ISO3166Utils:
    """Mapping of ISO 3166-1 alpha-2 country codes to full country names."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive lookup; returns None for unknown codes
        return cls._country_map.get(code.upper())
4018
4019
4020 class GeoUtils:
4021 # Major IPv4 address blocks per country
4022 _country_ip_map = {
4023 'AD': '46.172.224.0/19',
4024 'AE': '94.200.0.0/13',
4025 'AF': '149.54.0.0/17',
4026 'AG': '209.59.64.0/18',
4027 'AI': '204.14.248.0/21',
4028 'AL': '46.99.0.0/16',
4029 'AM': '46.70.0.0/15',
4030 'AO': '105.168.0.0/13',
4031 'AP': '182.50.184.0/21',
4032 'AQ': '23.154.160.0/24',
4033 'AR': '181.0.0.0/12',
4034 'AS': '202.70.112.0/20',
4035 'AT': '77.116.0.0/14',
4036 'AU': '1.128.0.0/11',
4037 'AW': '181.41.0.0/18',
4038 'AX': '185.217.4.0/22',
4039 'AZ': '5.197.0.0/16',
4040 'BA': '31.176.128.0/17',
4041 'BB': '65.48.128.0/17',
4042 'BD': '114.130.0.0/16',
4043 'BE': '57.0.0.0/8',
4044 'BF': '102.178.0.0/15',
4045 'BG': '95.42.0.0/15',
4046 'BH': '37.131.0.0/17',
4047 'BI': '154.117.192.0/18',
4048 'BJ': '137.255.0.0/16',
4049 'BL': '185.212.72.0/23',
4050 'BM': '196.12.64.0/18',
4051 'BN': '156.31.0.0/16',
4052 'BO': '161.56.0.0/16',
4053 'BQ': '161.0.80.0/20',
4054 'BR': '191.128.0.0/12',
4055 'BS': '24.51.64.0/18',
4056 'BT': '119.2.96.0/19',
4057 'BW': '168.167.0.0/16',
4058 'BY': '178.120.0.0/13',
4059 'BZ': '179.42.192.0/18',
4060 'CA': '99.224.0.0/11',
4061 'CD': '41.243.0.0/16',
4062 'CF': '197.242.176.0/21',
4063 'CG': '160.113.0.0/16',
4064 'CH': '85.0.0.0/13',
4065 'CI': '102.136.0.0/14',
4066 'CK': '202.65.32.0/19',
4067 'CL': '152.172.0.0/14',
4068 'CM': '102.244.0.0/14',
4069 'CN': '36.128.0.0/10',
4070 'CO': '181.240.0.0/12',
4071 'CR': '201.192.0.0/12',
4072 'CU': '152.206.0.0/15',
4073 'CV': '165.90.96.0/19',
4074 'CW': '190.88.128.0/17',
4075 'CY': '31.153.0.0/16',
4076 'CZ': '88.100.0.0/14',
4077 'DE': '53.0.0.0/8',
4078 'DJ': '197.241.0.0/17',
4079 'DK': '87.48.0.0/12',
4080 'DM': '192.243.48.0/20',
4081 'DO': '152.166.0.0/15',
4082 'DZ': '41.96.0.0/12',
4083 'EC': '186.68.0.0/15',
4084 'EE': '90.190.0.0/15',
4085 'EG': '156.160.0.0/11',
4086 'ER': '196.200.96.0/20',
4087 'ES': '88.0.0.0/11',
4088 'ET': '196.188.0.0/14',
4089 'EU': '2.16.0.0/13',
4090 'FI': '91.152.0.0/13',
4091 'FJ': '144.120.0.0/16',
4092 'FK': '80.73.208.0/21',
4093 'FM': '119.252.112.0/20',
4094 'FO': '88.85.32.0/19',
4095 'FR': '90.0.0.0/9',
4096 'GA': '41.158.0.0/15',
4097 'GB': '25.0.0.0/8',
4098 'GD': '74.122.88.0/21',
4099 'GE': '31.146.0.0/16',
4100 'GF': '161.22.64.0/18',
4101 'GG': '62.68.160.0/19',
4102 'GH': '154.160.0.0/12',
4103 'GI': '95.164.0.0/16',
4104 'GL': '88.83.0.0/19',
4105 'GM': '160.182.0.0/15',
4106 'GN': '197.149.192.0/18',
4107 'GP': '104.250.0.0/19',
4108 'GQ': '105.235.224.0/20',
4109 'GR': '94.64.0.0/13',
4110 'GT': '168.234.0.0/16',
4111 'GU': '168.123.0.0/16',
4112 'GW': '197.214.80.0/20',
4113 'GY': '181.41.64.0/18',
4114 'HK': '113.252.0.0/14',
4115 'HN': '181.210.0.0/16',
4116 'HR': '93.136.0.0/13',
4117 'HT': '148.102.128.0/17',
4118 'HU': '84.0.0.0/14',
4119 'ID': '39.192.0.0/10',
4120 'IE': '87.32.0.0/12',
4121 'IL': '79.176.0.0/13',
4122 'IM': '5.62.80.0/20',
4123 'IN': '117.192.0.0/10',
4124 'IO': '203.83.48.0/21',
4125 'IQ': '37.236.0.0/14',
4126 'IR': '2.176.0.0/12',
4127 'IS': '82.221.0.0/16',
4128 'IT': '79.0.0.0/10',
4129 'JE': '87.244.64.0/18',
4130 'JM': '72.27.0.0/17',
4131 'JO': '176.29.0.0/16',
4132 'JP': '133.0.0.0/8',
4133 'KE': '105.48.0.0/12',
4134 'KG': '158.181.128.0/17',
4135 'KH': '36.37.128.0/17',
4136 'KI': '103.25.140.0/22',
4137 'KM': '197.255.224.0/20',
4138 'KN': '198.167.192.0/19',
4139 'KP': '175.45.176.0/22',
4140 'KR': '175.192.0.0/10',
4141 'KW': '37.36.0.0/14',
4142 'KY': '64.96.0.0/15',
4143 'KZ': '2.72.0.0/13',
4144 'LA': '115.84.64.0/18',
4145 'LB': '178.135.0.0/16',
4146 'LC': '24.92.144.0/20',
4147 'LI': '82.117.0.0/19',
4148 'LK': '112.134.0.0/15',
4149 'LR': '102.183.0.0/16',
4150 'LS': '129.232.0.0/17',
4151 'LT': '78.56.0.0/13',
4152 'LU': '188.42.0.0/16',
4153 'LV': '46.109.0.0/16',
4154 'LY': '41.252.0.0/14',
4155 'MA': '105.128.0.0/11',
4156 'MC': '88.209.64.0/18',
4157 'MD': '37.246.0.0/16',
4158 'ME': '178.175.0.0/17',
4159 'MF': '74.112.232.0/21',
4160 'MG': '154.126.0.0/17',
4161 'MH': '117.103.88.0/21',
4162 'MK': '77.28.0.0/15',
4163 'ML': '154.118.128.0/18',
4164 'MM': '37.111.0.0/17',
4165 'MN': '49.0.128.0/17',
4166 'MO': '60.246.0.0/16',
4167 'MP': '202.88.64.0/20',
4168 'MQ': '109.203.224.0/19',
4169 'MR': '41.188.64.0/18',
4170 'MS': '208.90.112.0/22',
4171 'MT': '46.11.0.0/16',
4172 'MU': '105.16.0.0/12',
4173 'MV': '27.114.128.0/18',
4174 'MW': '102.70.0.0/15',
4175 'MX': '187.192.0.0/11',
4176 'MY': '175.136.0.0/13',
4177 'MZ': '197.218.0.0/15',
4178 'NA': '41.182.0.0/16',
4179 'NC': '101.101.0.0/18',
4180 'NE': '197.214.0.0/18',
4181 'NF': '203.17.240.0/22',
4182 'NG': '105.112.0.0/12',
4183 'NI': '186.76.0.0/15',
4184 'NL': '145.96.0.0/11',
4185 'NO': '84.208.0.0/13',
4186 'NP': '36.252.0.0/15',
4187 'NR': '203.98.224.0/19',
4188 'NU': '49.156.48.0/22',
4189 'NZ': '49.224.0.0/14',
4190 'OM': '5.36.0.0/15',
4191 'PA': '186.72.0.0/15',
4192 'PE': '186.160.0.0/14',
4193 'PF': '123.50.64.0/18',
4194 'PG': '124.240.192.0/19',
4195 'PH': '49.144.0.0/13',
4196 'PK': '39.32.0.0/11',
4197 'PL': '83.0.0.0/11',
4198 'PM': '70.36.0.0/20',
4199 'PR': '66.50.0.0/16',
4200 'PS': '188.161.0.0/16',
4201 'PT': '85.240.0.0/13',
4202 'PW': '202.124.224.0/20',
4203 'PY': '181.120.0.0/14',
4204 'QA': '37.210.0.0/15',
4205 'RE': '102.35.0.0/16',
4206 'RO': '79.112.0.0/13',
4207 'RS': '93.86.0.0/15',
4208 'RU': '5.136.0.0/13',
4209 'RW': '41.186.0.0/16',
4210 'SA': '188.48.0.0/13',
4211 'SB': '202.1.160.0/19',
4212 'SC': '154.192.0.0/11',
4213 'SD': '102.120.0.0/13',
4214 'SE': '78.64.0.0/12',
4215 'SG': '8.128.0.0/10',
4216 'SI': '188.196.0.0/14',
4217 'SK': '78.98.0.0/15',
4218 'SL': '102.143.0.0/17',
4219 'SM': '89.186.32.0/19',
4220 'SN': '41.82.0.0/15',
4221 'SO': '154.115.192.0/18',
4222 'SR': '186.179.128.0/17',
4223 'SS': '105.235.208.0/21',
4224 'ST': '197.159.160.0/19',
4225 'SV': '168.243.0.0/16',
4226 'SX': '190.102.0.0/20',
4227 'SY': '5.0.0.0/16',
4228 'SZ': '41.84.224.0/19',
4229 'TC': '65.255.48.0/20',
4230 'TD': '154.68.128.0/19',
4231 'TG': '196.168.0.0/14',
4232 'TH': '171.96.0.0/13',
4233 'TJ': '85.9.128.0/18',
4234 'TK': '27.96.24.0/21',
4235 'TL': '180.189.160.0/20',
4236 'TM': '95.85.96.0/19',
4237 'TN': '197.0.0.0/11',
4238 'TO': '175.176.144.0/21',
4239 'TR': '78.160.0.0/11',
4240 'TT': '186.44.0.0/15',
4241 'TV': '202.2.96.0/19',
4242 'TW': '120.96.0.0/11',
4243 'TZ': '156.156.0.0/14',
4244 'UA': '37.52.0.0/14',
4245 'UG': '102.80.0.0/13',
4246 'US': '6.0.0.0/8',
4247 'UY': '167.56.0.0/13',
4248 'UZ': '84.54.64.0/18',
4249 'VA': '212.77.0.0/19',
4250 'VC': '207.191.240.0/21',
4251 'VE': '186.88.0.0/13',
4252 'VG': '66.81.192.0/20',
4253 'VI': '146.226.0.0/16',
4254 'VN': '14.160.0.0/11',
4255 'VU': '202.80.32.0/20',
4256 'WF': '117.20.32.0/21',
4257 'WS': '202.4.32.0/19',
4258 'YE': '134.35.0.0/16',
4259 'YT': '41.242.116.0/22',
4260 'ZA': '41.0.0.0/11',
4261 'ZM': '102.144.0.0/13',
4262 'ZW': '102.177.192.0/18',
4263 }
4264
4265 @classmethod
4266 def random_ipv4(cls, code_or_block):
4267 if len(code_or_block) == 2:
4268 block = cls._country_ip_map.get(code_or_block.upper())
4269 if not block:
4270 return None
4271 else:
4272 block = code_or_block
4273 addr, preflen = block.split('/')
4274 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4275 addr_max = addr_min | (0xffffffff >> int(preflen))
4276 return str(socket.inet_ntoa(
4277 struct.pack('!L', random.randint(addr_min, addr_max))))
4278
4279
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387

def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    # int.to_bytes replaces the original hand-rolled 32-bit chunking loop.
    # Non-positive n yields the single byte b'\0', matching the old code's
    # behaviour for n == 0 (and for negative n, whose loop never ran).
    if n <= 0:
        s = b'\000'
    else:
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    # Pad the front with zero bytes up to a multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4312
4313
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes supersedes the original manual zero-padding plus
    # 4-byte struct-unpacking accumulation loop; empty input yields 0
    return int.from_bytes(s, 'big')
4329
4330
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The data is reversed before being interpreted as a big-endian hex number
    plaintext = int(binascii.hexlify(data[::-1]), 16)
    ciphertext = pow(plaintext, exponent, modulus)
    return '%x' % ciphertext
4346
4347
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    # Layout: [0, 2] + random non-zero-ish filler + [0] + data
    filler_len = length - len(data) - 3
    if filler_len < 8:  # equivalent to len(data) > length - 11
        raise ValueError('Input data too long for PKCS#1 padding')

    filler = [random.randint(0, 254) for _ in range(filler_len)]
    return [0, 2, *filler, 0, *data]
4361
4362
4363 def _base_n_table(n, table):
4364 if not table and not n:
4365 raise ValueError('Either table or n must be specified')
4366 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4367
4368 if n and n != len(table):
4369 raise ValueError(f'base {n} exceeds table length {len(table)}')
4370 return table
4371
4372
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    digits = _base_n_table(n, table)
    if not num:
        return digits[0]

    base = len(digits)
    chunks = []
    while num:
        num, remainder = divmod(num, base)
        chunks.append(digits[remainder])
    # Digits were produced least-significant first
    return ''.join(reversed(chunks))
4384
4385
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    lookup = {char: idx for idx, char in enumerate(_base_n_table(n, table))}
    base = len(lookup)
    value = 0
    for char in string:
        value = value * base + lookup[char]
    return value
4393
4394
def decode_packed_codes(code):
    """Expand 'packed' obfuscated code: every word in the payload is a base-N
    index into the symbol list extracted via PACKED_CODES_RE."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each base-N encoded index to its symbol (or to itself if empty)
    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        key = encode_base_n(idx, base)
        symbol_table[key] = symbols[idx] or key

    return re.sub(
        r'\b(\w+)\b', lambda match: symbol_table[match.group(0)],
        obfuscated_code)
4411
4412
def caesar(s, alphabet, shift):
    """Shift every character of s that occurs in alphabet by `shift` positions, wrapping around."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4420
4421
def rot47(s):
    """Apply the ROT47 substitution cipher (Caesar shift of 47 over printable ASCII)."""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4424
4425
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=VALUE,KEY="quoted,value"') into a dict."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Strip the surrounding quotes from quoted values
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
4433
4434
def urshift(val, n):
    """Unsigned 32-bit right shift, like JavaScript's `>>>` operator."""
    if val < 0:
        # Reinterpret the negative value as its unsigned 32-bit equivalent
        val += 0x100000000
    return val >> n
4437
4438
def write_xattr(path, key, value):
    """Write the extended attribute `key` = `value` (bytes) on the file at `path`.

    Tries, in order: NTFS Alternate Data Streams (Windows), os.setxattr or the
    pyxattr/xattr modules, then the setfattr/xattr command-line executables.
    Raises XAttrMetadataError when writing fails and XAttrUnavailableError when
    no write mechanism is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            # 'path:key' addresses the named alternate data stream of the file
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    setxattr = None
    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The command-line tools take the value as text, not bytes
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4490
4491
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to a random date between 1950 and 1995."""
    range_start = datetime.date(1950, 1, 1)
    range_end = datetime.date(1995, 12, 31)
    picked = range_start + datetime.timedelta(
        random.randint(0, (range_end - range_start).days))
    return {
        field: str(component)
        for field, component in (
            (year_field, picked.year),
            (month_field, picked.month),
            (day_field, picked.day),
        )
    }
4502
4503
def find_available_port(interface=''):
    """Return a free TCP port number on `interface`, or None if binding fails."""
    sock = socket.socket()
    try:
        # Binding to port 0 lets the OS pick an unused port
        sock.bind((interface, 0))
        return sock.getsockname()[1]
    except OSError:
        return None
    finally:
        sock.close()
4511
4512
# Templates for internet shortcut files, which are plain text files.

# Windows .url internet-shortcut format
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS .webloc shortcut (an XML plist document)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org .desktop link entry (Linux desktop environments)
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Shortcut-type name -> template text
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4544
4545
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """
    parts = urllib.parse.urlparse(iri)

    if '[' in parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` sets below follow https://url.spec.whatwg.org/#percent-encoded-bytes:
    # everything but letters, digits, '_.-' and the listed characters gets
    # percent-encoded as UTF-8; existing percent-escapes ('%' is safe) are kept.

    netloc = ''
    if parts.username:
        netloc += urllib.parse.quote(parts.username, safe=r"!$%&'()*+,~")
        if parts.password is not None:
            netloc += ':' + urllib.parse.quote(parts.password, safe=r"!$%&'()*+,~")
        netloc += '@'

    # Punycode for Unicode hostnames; the 'idna' encoding produces ASCII text
    netloc += parts.hostname.encode('idna').decode()
    # NOTE(review): port 80 is dropped regardless of scheme; for https this
    # changes the effective default port — confirm intended
    if parts.port is not None and parts.port != 80:
        netloc += f':{parts.port}'

    quoted_path = urllib.parse.quote_plus(parts.path, safe=r"!$%&'()*+,/:;=@|~")
    # Legacy parameter component; quoted the same way as the path
    quoted_params = urllib.parse.quote_plus(parts.params, safe=r"!$%&'()*+,/:;=@|~")
    quoted_query = urllib.parse.quote_plus(parts.query, safe=r"!$%&'()*+,/:;=?@{|}~")
    quoted_fragment = urllib.parse.quote_plus(parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")

    return urllib.parse.urlunparse(
        (parts.scheme, netloc, quoted_path, quoted_params, quoted_query, quoted_fragment))
4588
4589
def to_high_limit_path(path):
    """On Windows, prefix the absolute path with '\\\\?\\' to bypass the MAX_PATH limitation."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
4596
4597
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Extract `field` from obj, returning `default` when the value is ignorable,
    else `template` interpolated with func(value)."""
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        # Default behaviour: any falsy value is ignored
        ignorable = not val
    else:
        ignorable = val in variadic(ignore)
    return default if ignorable else template % func(val)
4603
4604
def clean_podcast_url(url):
    """Strip known podcast tracking/analytics prefixes from url."""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/'''
    stripped = re.sub(tracking_prefix_re, '', url)
    # Stripping may leave a nested scheme ('https://http://...'); keep the inner one
    return re.sub(r'^\w+://(\w+://)', r'\1', stripped)
4625
4626
4627 _HEX_TABLE = '0123456789abcdef'
4628
4629
def random_uuidv4():
    """Generate a random RFC 4122 version-4 UUID string.

    The position marked 'y' in the template carries the variant nibble, which
    RFC 4122 section 4.4 restricts to 8, 9, a or b; the previous implementation
    incorrectly allowed any hex digit there.
    """
    hex_digits = '0123456789abcdef'
    return re.sub(
        r'[xy]',
        lambda m: random.choice('89ab') if m.group(0) == 'y' else random.choice(hex_digits),
        'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4632
4633
def make_dir(path, to_screen=None):
    """Create the parent directory of `path` (if any), returning success.

    @param path       file path whose containing directory should exist
    @param to_screen  optional callable used to report a failure message
    @returns          True on success (or nothing to create), False on failure
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Bug fix: the original condition was `callable(to_screen) is not None`,
        # which is always True (callable() returns a bool) and crashed with a
        # TypeError when to_screen was None
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4644
4645
def get_executable_path():
    """Return the directory containing the running executable/script."""
    from ..update import _get_variant_and_executable_path

    _, exe_path = _get_variant_and_executable_path()
    return os.path.dirname(os.path.abspath(exe_path))
4650
4651
def get_user_config_dirs(package_name):
    """Yield candidate per-user configuration directories for package_name."""
    # XDG config home (e.g. ~/.config/package_name)
    yield os.path.join(
        os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config'), package_name)

    # Windows roaming appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # Hidden directory in the home folder (~/.package_name)
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')
4664
4665
def get_system_config_dirs(package_name):
    """Yield candidate system-wide configuration directories (/etc/package_name)."""
    yield os.path.join('/etc', package_name)
4669
4670
def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
    """
    # kwargs are timedelta components (hours=, minutes=, ...) added as an offset
    offset = datetime.timedelta(**kwargs).total_seconds()
    return time.time() + offset
4676
4677
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JSON Web Signature (JWS Compact Serialization) signed with HS256.

    @param payload_data  JSON-serializable claims object
    @param key           shared HMAC secret (str)
    @param headers       optional extra JOSE header fields, merged over alg/typ
    @returns             token as bytes: b'<header>.<payload>.<signature>'

    NOTE: standard JWS uses unpadded base64url; this implementation uses plain
    base64 with padding — kept as-is for compatibility with existing callers.
    """
    # Fix: `headers={}` was a mutable default argument; use None as sentinel
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4695
4696
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode and return the payload of a JWT. Does NOT verify the signature."""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # Restore any stripped base64 padding; superfluous '=' are ignored
    return json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
4703
4704
4705 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4706
4707
@functools.cache
def supports_terminal_sequences(stream):
    """Whether ANSI/VT escape sequences can be used when writing to `stream`."""
    # On Windows, VT processing must have been enabled (see windows_enable_vt_mode);
    # elsewhere, a TERM environment variable must be present
    unsupported = (
        not WINDOWS_VT_MODE if compat_os_name == 'nt'
        else not os.getenv('TERM'))
    if unsupported:
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
4719
4720
def windows_enable_vt_mode():
    """Ref: https://bugs.python.org/issue30075 """
    # VT sequence support requires Windows 10 build 10586 or newer
    if get_windows_version() < (10, 0, 10586):
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly rather than relying on stdout
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Add VT processing to the existing console mode flags
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Invalidate previously cached results now that VT mode is enabled
    supports_terminal_sequences.cache_clear()
4751
4752
4753 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
4754
4755
def remove_terminal_sequences(string):
    """Strip ANSI SGR (color/style) escape sequences from `string`."""
    # Same pattern as the module-level _terminal_sequences_re; the `re` module
    # caches compiled patterns, so inlining it costs nothing
    return re.sub('\033\\[[^m]+m', '', string)
4758
4759
def number_of_digits(number):
    """Length of the '%d' decimal rendering of `number` (a '-' sign counts)."""
    decimal = '%d' % number
    return len(decimal)
4762
4763
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy values with `delim`; with `from_dict`, each value is first
    resolved as a traversal path into that dict."""
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(val) for val in values if val)
4768
4769
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails

    max_width = str(max_dimensions[0])
    scaled = []
    for thumbnail in thumbnails:
        scaled.append(merge_dicts(
            {'url': re.sub(url_width_re, max_width, thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
4790
4791
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    match = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if match is None:
        return None, None, None
    start, end, total = match.group(1), match.group(2), match.group(3)
    return int(start), int_or_none(end), int_or_none(total)
4800
4801
def read_stdin(what):
    """Return sys.stdin, printing an interactive hint about `what` is being read."""
    if what:
        eof_key = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
        write_string(f'Reading {what} from STDIN - EOF ({eof_key}) to end:\n')
    return sys.stdin
4807
4808
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """
    # A BOM takes precedence over any in-file coding declaration
    for bom, encoding in BOMS:
        if data.startswith(bom):
            return encoding, len(bom)

    # Drop null bytes so UTF-16/UTF-32 text matches regardless of endianness
    data = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
    if mobj:
        return mobj.group(1).decode(), 0
    return None, 0
4825
4826
class Config:
    """Hierarchy of command-line/config-file argument sources.

    Wraps an option parser plus a tree of loaded configuration sources
    (files, stdin), de-duplicating config locations and supporting
    recursively referenced configs via `config_locations`.
    """
    own_args = None      # raw args this config was initialized with
    parsed_args = None   # own args after a successful parse_known_args
    filename = None      # file the args were read from, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Initialize with the given args/filename and load referenced configs."""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own args and recursively append any configs they reference.

        @returns False if this config's file was already loaded, else True
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            # Guard against loading the same file twice (e.g. circular references)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against this config's directory
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read and shlex-split a config file; returns `default` if it cannot be opened."""
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # Bug fix: the message contained the literal placeholder "(unknown)"
            # instead of interpolating the actual filename
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of opts with values of sensitive options replaced by 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handle the '--option=value' form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handle the '--option value' form
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child Config sharing the loaded-path set; append it if it loads."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Yield all args; own args come last so they take precedence when parsed."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
4934
4935
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in headers.items():
            # Title-casing normalizes the key so later dicts overwrite earlier ones
            merged[name.title()] = value
    return merged
4939
4940
def cached_method(f):
    """Cache a method"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Normalize the call into a canonical key, excluding `self`
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        cache_key = tuple(bound.arguments.values())[1:]

        # Per-instance, per-method cache stored on the instance itself
        method_caches = vars(self).setdefault('_cached_method__cache', {})
        cache = method_caches.setdefault(f.__name__, {})
        if cache_key not in cache:
            cache[cache_key] = f(self, *args, **kwargs)
        return cache[cache_key]
    return wrapper
4956
4957
class classproperty:
    """property access for class methods with optional caching"""

    def __new__(cls, func=None, *args, **kwargs):
        # Called without a function, e.g. @classproperty(cache=True):
        # return a partial that will receive the function on the next call
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        # No caching requested: recompute on every access
        if self._cache is None:
            return self.func(cls)
        # Cache per accessed class, so subclasses get their own entries
        if cls not in self._cache:
            self._cache[cls] = self.func(cls)
        return self._cache[cls]
4976
4977
class function_with_repr:
    """Wrap a callable so repr() shows its qualified name, or a custom string."""

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func, self.__repr = func, repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    @classmethod
    def set_repr(cls, repr_):
        """Return a constructor with the custom repr preset."""
        return functools.partial(cls, repr_=repr_)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
4994
4995
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        # Iterating a Namespace yields its attribute values in insertion order
        yield from vars(self).values()

    @property
    def items_(self):
        return vars(self).items()
5005
5006
# Known media-related file extensions, grouped by kind
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# The generic video/audio groups also include all of the "common" extensions
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# Union of video, audio and manifest extensions
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5021
5022
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # attempt: 1-based number of the attempt currently running inside the loop
    # _error: error stored for the current attempt; NO_DEFAULT means "unset"
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        # Invoked as error_callback(error, attempt, retries) after each failed attempt
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # Keep looping while the previous attempt set an error (or none ran yet)
        # and the retry budget is not exhausted
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # Expose the NO_DEFAULT "unset" sentinel as None to callers
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            # Caller's loop body may have stored an error; report it
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            # Out of retries: report via `error` if provided, else re-raise
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # Prefer the underlying cause's message for readability
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5077
5078
def make_archive_id(ie, video_id):
    """Build a download-archive entry: '<lowercased extractor key> <video id>'."""
    key = ie if isinstance(ie, str) else ie.ie_key()
    return f'{key.lower()} {video_id}'
5082
5083
def truncate_string(s, left, right=0):
    """Shorten s to at most left+right characters, marking the elision with '...'."""
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    tail = s[-right:] if right else ''
    return f'{s[:left - 3]}...{tail}'
5089
5090
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Expand option names (supporting '-' prefixes for removal and aliases)
    into an ordered, de-duplicated list; 'all' must be a defined alias."""
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for val in options:
        discard = val.startswith('-')
        if discard:
            val = val[1:]

        if val in alias_dict:
            # Expand the alias; discarding an alias negates each of its members
            expansion = alias_dict[val]
            if discard:
                expansion = [item[1:] if item.startswith('-') else f'-{item}' for item in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(val, re.I).fullmatch, alias_dict['all'])
        else:
            current = [val] if val in alias_dict['all'] else None
        if current is None:
            raise ValueError(val)

        if discard:
            for item in current:
                while item in requested:
                    requested.remove(item)
        else:
            requested.extend(current)

    return orderedSet(requested)
5119
5120
# TODO: Rewrite
class FormatSorter:
    """Builds sort keys for format dicts from a field-preference specification.

    The sort specification is assembled from ``ydl.params['format_sort']``
    (user), the extractor-supplied ``field_preference``, and ``self.default``.
    ``calculate_preference(format)`` returns a tuple of per-field comparison
    tuples suitable for use as a ``sorted(..., key=...)`` key.
    """

    # Grammar of a single sort token, e.g. "+res:1080" or "br~2000":
    #   optional '+' reverses the order, ':' caps at a limit, '~' sorts by
    #   proximity to the limit.
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Sort order approximating youtube-dl's behaviour (used elsewhere;
    # presumably selected via a compat option -- not referenced in this class)
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration. Recognised keys (defaults are synthesised in
    # _get_field_setting for missing ones):
    #   'type':       'ordered' (ranked by 'order'), 'boolean', 'extractor',
    #                 'combined'/'multiple' (derived from several fields),
    #                 'alias', or the implicit default 'field'
    #   'field':      format-dict key(s) the value is read from, when it
    #                 differs from the setting's own name
    #   'convert':    normalisation applied to values/limits
    #                 ('float', 'float_none', 'bytes', 'string', 'order', ...)
    #   'regex':      for 'ordered' types, entries of 'order' are regexes
    #                 matched (re.match, i.e. anchored at the start) against
    #                 the value
    #   'order_free': alternative ranking used when prefer_free_formats is set
    #   'function':   for 'multiple' types, folds the component values into one
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       # 1 if the format has any audio or video stream, else 0
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        # Bitrate/size: first truthy value wins (tbr > vbr > abr, etc.)
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        # Resolution: smaller of height/width (ignoring missing values)
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        """Parse the sort specification from ydl.params and `field_preference`
        (the extractor-supplied sort list) into self._order/self.settings."""
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return the value of `key` for `field` from self.settings,
        synthesising (and caching) a type-dependent default when missing.
        Unknown fields are accepted with a deprecation warning and get an
        empty settings entry (all defaults)."""
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            # NB: defaults are written back into the (class-level) settings
            # dict, so later lookups hit the cached value
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Normalise a raw value (or limit string) for `field` into a
        comparable value per the field's 'convert' setting.

        With convertNone=False, None is returned unchanged; with
        convertNone=True, None is pushed through the conversion (used for
        'ordered' fields, where None may appear in the order list)."""
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            # Rank within the order list: earlier entries get higher scores;
            # values absent from the list rank like the '' entry (or below
            # everything when there is no '' entry)
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            # 'float_string': numeric strings compare as numbers; the first
            # non-numeric value demotes the whole field to string comparison
            if value.isnumeric():
                return float(value)
            else:
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Build self._order (the list of fields to sort by, first = most
        significant) and merge per-field reverse/closest/limit data into
        self.settings."""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # First occurrence of a field wins; duplicates are ignored
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # Precedence: forced defaults, then (unless format_sort_force)
        # priority defaults, then user, then extractor, then the defaults
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A 'combined' field fans out to its component fields; a
            # colon-separated limit is distributed across them
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Write the resolved sort order (user, extractor, final) via the
        supplied debug-logging callable."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn one field's raw value into a 2- or 3-tuple comparison key,
        honouring the field's reverse/closest/limit settings."""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Read the field's value(s) from `format` (folding 'multiple' fields
        through their 'function') and return its comparison key."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return the full sort key for `format` (tuple of per-field keys).

        NB: mutates `format` in place, filling in missing 'protocol', 'ext',
        'video_ext'/'audio_ext' and vbr/abr/tbr values first."""
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5416
5417
# XXX: Temporary
class _YDLLogger:
    """Adapter exposing a logger-like interface over a YoutubeDL instance.

    Every method silently does nothing when no instance is attached,
    so the logger can always be called unconditionally.
    """

    def __init__(self, ydl=None):
        self._ydl = ydl

    def debug(self, message):
        """Forward to YoutubeDL.write_debug."""
        if self._ydl is None:
            return
        self._ydl.write_debug(message)

    def info(self, message):
        """Forward to YoutubeDL.to_screen."""
        if self._ydl is None:
            return
        self._ydl.to_screen(message)

    def warning(self, message, *, once=False):
        """Forward to YoutubeDL.report_warning."""
        if self._ydl is None:
            return
        self._ydl.report_warning(message, once)

    def error(self, message, *, is_error=True):
        """Forward to YoutubeDL.report_error."""
        if self._ydl is None:
            return
        self._ydl.report_error(message, is_error=is_error)

    def stdout(self, message):
        """Forward to YoutubeDL.to_stdout."""
        if self._ydl is None:
            return
        self._ydl.to_stdout(message)

    def stderr(self, message):
        """Forward to YoutubeDL.to_stderr."""
        if self._ydl is None:
            return
        self._ydl.to_stderr(message)