]> jfr.im git - yt-dlp.git/blob - yt_dlp/utils/_utils.py
[cleanup] Fix misc bugs (#8968)
[yt-dlp.git] / yt_dlp / utils / _utils.py
1 import base64
2 import binascii
3 import calendar
4 import codecs
5 import collections
6 import collections.abc
7 import contextlib
8 import datetime
9 import email.header
10 import email.utils
11 import errno
12 import hashlib
13 import hmac
14 import html.entities
15 import html.parser
16 import inspect
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import mimetypes
23 import netrc
24 import operator
25 import os
26 import platform
27 import random
28 import re
29 import shlex
30 import socket
31 import ssl
32 import struct
33 import subprocess
34 import sys
35 import tempfile
36 import time
37 import traceback
38 import types
39 import unicodedata
40 import urllib.error
41 import urllib.parse
42 import urllib.request
43 import xml.etree.ElementTree
44
45 from . import traversal
46
47 from ..compat import functools # isort: split
48 from ..compat import (
49 compat_etree_fromstring,
50 compat_expanduser,
51 compat_HTMLParseError,
52 compat_os_name,
53 compat_shlex_quote,
54 )
55 from ..dependencies import xattr
56
# Drop the last dotted component of this module's name
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
# (the type of compiled pattern objects, obtained from an actual compiled pattern)
compiled_regex_type = type(re.compile(''))
61
62
class NO_DEFAULT:
    """Sentinel meaning "no default value was supplied".

    Used as a parameter default so that None can still be passed as a real
    default; it is compared by identity (``default is not NO_DEFAULT``).
    """
65
66
def IDENTITY(x):
    """Return the argument unchanged"""
    return x
69
70
# English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names in calendar order, keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}
86
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Maps a timezone abbreviation to its offset from UTC in whole hours
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
96
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. Replacements longer
# than one character are wrapped in a list so that zip() pairs each of them
# with a single source character instead of splitting them up.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
101
# strptime-style format strings that do not depend on day/month ordering
# NOTE(review): presumably tried one by one by the date parsing helpers,
# which are outside this part of the file -- confirm before reordering
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# The common formats followed by the numeric formats that put the day first
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# The common formats followed by the numeric formats that put the month first
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
167
# Matches the call arguments `}('<payload>',<int>,<int>,'<words>'.split('|')`
# and captures the four values
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element (case-insensitive, dot
# matches newlines) and captures its object/array body as `json_ld`
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# Unsigned integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
172
173
@functools.cache
def preferredencoding():
    """Return the name of a usable text encoding for this system.

    The value of locale.getpreferredencoding() is used if text can actually
    be encoded with it; in every other case 'UTF-8' is returned.
    The result is cached after the first call.
    """
    fallback = 'UTF-8'
    try:
        candidate = locale.getpreferredencoding()
        # Make sure the reported encoding is really known to Python
        'TEST'.encode(candidate)
    except Exception:
        return fallback
    return candidate
188
189
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn atomically

    The data is first written to a temporary file in the directory of fn,
    which is then moved over fn in a single step.

    @param obj  The JSON-serializable object to store
    @param fn   Path of the destination file
    Any exception is re-raised after the temporary file has been removed.
    """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        # Give the file the permissions a regular open() would have produced
        # under the current umask (best effort)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        # os.replace overwrites an existing destination on every platform
        # (os.rename fails on Windows when fn exists), so fn never has to be
        # deleted first and there is no moment at which it is missing
        os.replace(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
214
215
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the first element matching xpath[@key], or xpath[@key='val'] if val is given """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
221
222 # On python2.6 the xml.etree.ElementTree.Element methods don't support
223 # the namespace parameter
224
225
def xpath_with_ns(path, ns_map):
    """Expand every `prefix:tag` step of path to `{namespace}tag` using ns_map"""
    def _qualify(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, local_name = pieces
        return '{%s}%s' % (ns_map[prefix], local_name)

    return '/'.join(map(_qualify, path.split('/')))
236
237
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element below node that matches xpath

    @param xpath    A single path, or an iterable of paths that are tried in order
    @param name     Label used in the error message (defaults to xpath)
    @param fatal    Raise ExtractorError if nothing matches and no default is given
    @param default  Value to return if nothing matches; takes precedence over fatal
    @returns        The matching element, else default, else None
    """
    paths = [xpath] if isinstance(xpath, str) else xpath

    # Start from None so that an empty iterable of paths is treated as
    # "not found" instead of leaving the result unbound
    n = None
    for xp in paths:
        n = node.find(xp)
        if n is not None:
            break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
    return None
259
260
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element

    An element without text is handled like a missing element: default is
    returned if given, else ExtractorError is raised if fatal, else None.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element
    if element.text is not None:
        return element.text
    if default is not NO_DEFAULT:
        return default
    if not fatal:
        return None
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element\'s text %s' % name)
274
275
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it

    If there is no such element, default is returned if given, else
    ExtractorError is raised if fatal, else None is returned.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if not fatal:
        return None
    if name is None:
        name = f'{xpath}[@{key}]'
    raise ExtractorError('Could not find XML attribute %s' % name)
287
288
def get_element_by_id(id, html, **kwargs):
    """Return the content of the first tag whose id attribute equals id, or None"""
    return get_element_by_attribute('id', id, html, **kwargs)
292
293
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the first tag whose id attribute equals id, or None"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
297
298
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document, or None"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
303
304
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document, or None"""
    matches = get_elements_html_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
309
310
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute value, or None"""
    matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    if not matches:
        return None
    return matches[0]
314
315
def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the html of the first tag with the specified attribute value, or None"""
    matches = get_elements_html_by_attribute(attribute, value, html, **kargs)
    if not matches:
        return None
    return matches[0]
319
320
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list

    Extra keyword arguments (e.g. tag) are passed on to get_elements_by_attribute
    instead of being dropped. escape_value is always False, since the value
    handed over is already a regular expression.
    """
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, **{**kargs, 'escape_value': False})
326
327
def get_elements_html_by_class(class_name, html):
    """Return the html of every tag with the specified class in the passed HTML document as a list"""
    # The class may be surrounded by other classes within the same attribute value
    class_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_re, html, escape_value=False)
333
334
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of every tag with the specified attribute in the passed HTML document as a list"""
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [text for text, _whole in pairs]
338
339
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of every tag with the specified attribute in the passed HTML document as a list"""
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [whole for _text, whole in pairs]
343
344
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    This is a generator yielding one (content, whole) tuple per matching tag;
    nothing is yielded if value is empty.

    @param attribute     Name of the attribute; always matched literally
    @param value         Value of the attribute
    @param tag           Regex that the tag name has to match
    @param escape_value  Match value literally if True, else use it as a regex
    """
    if not value:
        return

    # The quotes around the value are optional in the regex below, unless the
    # value starts with one of these characters
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Verbose regex matching an opening tag up to and including the attribute;
    # (?-x:...) turns verbose mode off again for the value itself
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
    '''

    for m in re.finditer(partial_element_re, html):
        # Extract the whole element, beginning at the matched opening tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Remove a pair of matching quotes around the content, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
370
371
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        # Names of the tags that have been opened but not closed yet, outermost first
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop tags until the one being closed is found; tags popped on the way
        # were never closed explicitly
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            # The stack ran empty without finding the tag
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The first opening tag has just been closed
            raise self.HTMLBreakOnClosingTagException()
412
413
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    Raises compat_HTMLParseError if the opening tag or its matching closing
    tag cannot be found.
    """
    def find_or_raise(haystack, needle, exc):
        # Like str.index, but raises the given exception on failure
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # Turn the relative position of '>' into the absolute position just behind it
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed the opening tag on its own and make sure it was recognized as such
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document one closing tag at a time until the parser signals
        # that the first opening tag has been closed
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                # offset was not advanced, so both positions are relative to it
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
448
449
class HTMLAttributeParser(html.parser.HTMLParser):
    """Minimal HTML parser that records the attributes of the first start tag it sees"""

    def __init__(self):
        self.attrs = {}
        super().__init__()

    def handle_starttag(self, tag, attrs):
        # Only the first element is of interest: store its attributes and
        # abort the parsing by raising
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')
460
461
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser collecting the attributes of the <li> elements that are not nested in another element"""

    def __init__(self):
        super().__init__()
        self._level = 0  # number of currently open elements
        self.items = []  # one attribute dict per top-level <li>

    def handle_starttag(self, tag, attrs):
        at_top_level = self._level == 0
        self._level += 1
        if at_top_level and tag == 'li':
            self.items.append(dict(attrs))

    def handle_endtag(self, tag):
        self._level -= 1
477
478
def extract_attributes(html_element):
    """Decode the attributes of an HTML element given as a string.

    For an element such as
        <el
            a="foo" B="bar" d=boz
            empty= noval entity="&amp;"
            sq='"' dq="'"
        >
    the returned dictionary is
        {
            'a': 'foo', 'b': 'bar', 'd': 'boz',
            'empty': '', 'noval': None, 'entity': '&',
            'sq': '"', 'dq': "'"
        }
    An empty dictionary is returned if no element could be parsed.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    except compat_HTMLParseError:
        # Raised by the parser itself once the first element has been handled
        pass
    return parser.attrs
498
499
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list holding one dictionary of attributes per element"""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
507
508
def clean_html(html):
    """Turn an HTML snippet into readable plain text; None is passed through"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    substitutions = (
        (r'\s+', ' '),  # collapse whitespace
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),  # line breaks
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),  # paragraph boundaries
        ('<.*?>', ''),  # strip html tags
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    # Replace html entities
    return unescapeHTML(html).strip()
523
524
class LenientJSONDecoder(json.JSONDecoder):
    """JSON decoder that tolerates some kinds of malformed input

    @param transform_source  Callable applied to the string before it is decoded
    @param ignore_extra      Ignore leading whitespace and everything that
                             follows the first JSON value
    @param close_objects     Number of unterminated objects/arrays at the end of
                             the input that may be closed automatically
    All other arguments are passed on to json.JSONDecoder.
    """
    # TODO: Write tests

    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # Closing a container may need two passes: one that appends a comma (to
        # learn from the next error which kind of container is open) and one
        # that replaces the comma with the closing bracket
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Return the document truncated at the error and repaired by one step, or None if that is not possible"""
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        """Decode s; raises json.JSONDecodeError with an excerpt of the offending text on failure"""
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                if attempt < self._close_attempts:
                    repaired = self._close_object(e)
                    # Only replace the document if it could be repaired, so
                    # that the error below is never built from None
                    if repaired is not None:
                        s = repaired
                        continue
                # e.pos refers to e.doc, which differs from s if leading
                # whitespace was stripped. Clamp the start, since a negative
                # index would slice from the end of the document.
                start = max(0, e.pos - 10)
                raise type(e)(f'{e.msg} in {e.doc[start:e.pos + 10]!r}', e.doc, e.pos)
        assert False, 'Too many attempts to decode JSON'
563
564
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    A filename of '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        # Use the underlying binary stream if stdout has one
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # First attempt uses the filename as given, the second one the sanitized filename
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    # NOTE(review): presumably a subclass of OSError, so that
                    # the fallback below handles it -- defined elsewhere, confirm
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Fall back to opening the file without a lock
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Give up on the second failure, or immediately on EACCES
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Retrying is pointless if sanitizing did not change the name
            if old_filename == filename:
                raise
602
603
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp; None if it cannot be parsed"""
    parsed = email.utils.parsedate_tz(timestr)
    return None if parsed is None else email.utils.mktime_tz(parsed)
611
612
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Replacements start with a NUL marker ('\0') so that they can be told
        # apart from characters of the input in the clean-up steps below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Question marks and control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            # Characters of the Unicode categories C* and M* are dropped, the rest replaced
            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers; an empty result becomes '_'
    result = result.replace('\0', '') or '_'

    # Also taken if is_id is unset, since the NO_DEFAULT class itself is truthy
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
666
667
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    On other platforms the path is returned unchanged, unless force is set.
    """
    # XXX: this handles drive relative paths (c:sth) incorrectly
    if sys.platform == 'win32':
        # force only has a meaning on other platforms
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the leading empty component; the drive is put back below
        norm_path.pop(0)
    # Replace forbidden characters, as well as a trailing whitespace or dot,
    # in every component with '#'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    # TODO: Fix behavioral differences <3.12
    # The workaround using `normpath` only superficially passes tests
    # Ref: https://github.com/python/cpython/pull/100351
    return os.path.normpath(os.path.join(*sanitized_path))
693
694
def sanitize_url(url, *, scheme='http'):
    """Repair common defects of a URL

    Protocol-relative URLs (`//host/...`) are prefixed with scheme, in order
    to mitigate the number of unwanted failures due to a missing protocol,
    and a few known misspellings of the scheme are corrected.
    None is passed through.
    """
    if url is None:
        return None
    if url.startswith('//'):
        return f'{scheme}:{url}'

    # Common typos seen so far
    typo_fixes = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for pattern, correction in typo_fixes:
        if re.match(pattern, url) is not None:
            return re.sub(pattern, correction, url)
    return url
713
714
def extract_basic_auth(url):
    """Split the credentials embedded in a URL off of it

    @returns    (url_without_credentials, authorization), where authorization
                is the value for a Basic `Authorization` header, or None if
                the URL does not contain a username
    """
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    host = parts.hostname
    # `hostname` has the brackets of an IPv6 literal removed; put them back,
    # as otherwise the host could not be told apart from the port
    if host and ':' in host:
        host = f'[{host}]'
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        host if parts.port is None
        else '%s:%d' % (host, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
725
726
def expand_path(s):
    """Expand ~ and then shell variables in the path s"""
    with_home = compat_expanduser(s)
    return os.path.expandvars(with_home)
730
731
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable, keeping the first occurrences in order

    Returns a list, or a generator if lazy is set.
    """
    def _unique():
        emitted = []  # Not a set, since the items can be unhashable
        for item in iterable:
            if item in emitted:
                continue
            emitted.append(item)
            yield item

    unique = _unique()
    return unique if lazy else list(unique)
742
743
744 def _htmlentity_transform(entity_with_semicolon):
745 """Transforms an HTML entity to a character."""
746 entity = entity_with_semicolon[:-1]
747
748 # Known non-numeric HTML entity
749 if entity in html.entities.name2codepoint:
750 return chr(html.entities.name2codepoint[entity])
751
752 # TODO: HTML5 allows entities without a semicolon.
753 # E.g. '&Eacuteric' should be decoded as 'Éric'.
754 if entity_with_semicolon in html.entities.html5:
755 return html.entities.html5[entity_with_semicolon]
756
757 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
758 if mobj is not None:
759 numstr = mobj.group(1)
760 if numstr.startswith('x'):
761 base = 16
762 numstr = '0%s' % numstr
763 else:
764 base = 10
765 # See https://github.com/ytdl-org/youtube-dl/issues/7518
766 with contextlib.suppress(ValueError):
767 return chr(int(numstr, base))
768
769 # Unknown entity in name, return its literal representation
770 return '&%s;' % entity
771
772
def unescapeHTML(s):
    """Decode the HTML entities in the string s; None is passed through"""
    if s is None:
        return None
    assert isinstance(s, str)

    def _decode(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^&;]+;)', _decode, s)
780
781
def escapeHTML(text):
    """Replace &, <, >, " and ' in text with HTML entities"""
    # The ampersand has to come first, so that the ampersands of the
    # entities inserted afterwards are not escaped again
    entities = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    for char, entity in entities:
        text = text.replace(char, entity)
    return text
791
792
class netrc_from_content(netrc.netrc):
    """netrc data parsed from a string instead of from a file"""

    def __init__(self, content):
        self.hosts = {}
        self.macros = {}
        stream = io.StringIO(content)
        try:
            # '-' is passed as the name of the file
            self._parse('-', stream, False)
        finally:
            stream.close()
798
799
class Popen(subprocess.Popen):
    """subprocess.Popen with adjusted defaults and a few helper methods"""

    # Startup information passed to every process; only set on Windows
    # NOTE(review): presumably this keeps console windows from popping up -- confirm
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573

        Modifies env in place; does nothing if sys._MEIPASS is not set.
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            # Restore the value saved as <key>_ORIG, or unset the variable
            # if there is no saved value
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        """
        @param env      Environment of the process; a copy of os.environ if None.
                        A mapping passed in may be modified in place
        @param text     If True, decode the output as UTF-8 (replacing errors)
                        unless encoding/errors are given
        @param shell    Run args through the shell
        All other arguments are passed on to subprocess.Popen.
        """
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether the streams carry text, to pick the default output in run()
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        # On Windows, build the shell command line here instead of letting
        # subprocess do it, unless a specific executable was requested
        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            if not isinstance(args, str):
                args = ' '.join(compat_shlex_quote(a) for a in args)
            shell = False
            args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        """Return the absolute path of the command interpreter, taken from os.environ"""
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process and wait for it if anything is raised"""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        """Kill the process; unless timeout is 0, also wait(timeout) for it afterwards"""
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run a process to completion and return (stdout, stderr, returncode)

        Output that is missing or empty is returned as '' in text mode and as b'' otherwise.
        """
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
870
871
def encodeArgument(s):
    """Return s as str; anything else is expected to be ASCII bytes and is decoded"""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
877
878
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration in milliseconds into a Time(hours, minutes, seconds, milliseconds) tuple"""
    total_secs, millis = divmod(msec, 1000)
    total_mins, seconds = divmod(total_secs, 60)
    hours, minutes = divmod(total_mins, 60)
    return _timetuple(hours, minutes, seconds, millis)
887
888
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [[H:]MM:]SS, leaving out leading fields that are zero

    @param delim    Separator to put between the fields
    @param msec     Append the milliseconds as `.mmm`
    """
    parts = timetuple_from_msec(secs * 1000)
    if parts.hours:
        ret = '%d%s%02d%s%02d' % (parts.hours, delim, parts.minutes, delim, parts.seconds)
    elif parts.minutes:
        ret = '%d%s%02d' % (parts.minutes, delim, parts.seconds)
    else:
        ret = '%d' % parts.seconds
    if not msec:
        return ret
    return '%s.%03d' % (ret, parts.milliseconds)
898
899
def bug_reports_message(before=';'):
    """Return the request to report a bug, prefixed with before

    The request is capitalized if it starts a sentence, i.e. if before
    is blank or ends with '.', '!' or '?'.
    """
    from ..update import REPOSITORY

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    prefix = before.rstrip()
    starts_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_sentence:
        msg = msg[0].title() + msg[1:]

    if not prefix:
        return msg
    return prefix + ' ' + msg
911
912
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Default message; subclasses may override it. If there is none, the
    # name of the class is used
    msg = None

    def __init__(self, msg=None):
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
923
924
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.

        cause, video_id and ie, if given, are included in the final message.
        """
        from ..networking.exceptions import network_exceptions
        # An error raised while a network exception is being handled is always expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # If the exception being handled is an ExtractorError itself, take over its exc_info
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message, assembled from ie, video_id, orig_msg and cause, and
        # followed by the request to report a bug unless the error is expected
        # NOTE(review): format_field is defined elsewhere; presumably it
        # returns '' for a field that is None -- confirm
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted traceback and/or the formatted cause, or None if there is neither"""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Once a message has been set, rebuild it (and args) whenever any
        # other attribute is assigned, so that it reflects the new value
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
967
968
class UnsupportedError(ExtractorError):
    """Expected error for a URL that is not supported; the URL is kept in `url`"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
974
975
class RegexNotFoundError(ExtractorError):
    """Error raised when a regular expression did not match"""
979
980
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    May be raised when a website does not make a video available in your
    geographic location. The error is always marked as expected; the
    `countries` argument is stored on the instance.
    """

    def __init__(self, msg, countries=None, **kwargs):
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
992
993
class UserNotLive(ExtractorError):
    """Expected error for a channel/user that is not live"""

    def __init__(self, msg=None, **kwargs):
        message = msg or 'The channel is not currently live'
        super().__init__(message, **{**kwargs, 'expected': True})
1000
1001
class DownloadError(YoutubeDLError):
    """Download Error exception.

    FileDownloader objects may raise this exception, carrying the
    appropriate error message, if they are not configured to continue
    on errors.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, describes the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
1014
1015
class EntryNotInPlaylist(YoutubeDLError):
    """Entry missing from playlist.

    Thrown by YoutubeDL when a requested entry cannot be found
    in the playlist info_dict
    """

    msg = 'Entry not found in info'
1023
1024
class SameFileError(YoutubeDLError):
    """Same-file error.

    Thrown by FileDownloader objects when several downloads would end up
    in one and the same file on disk.
    """

    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Assigning creates an instance attribute; the class-level default stays untouched
            self.msg = f'{self.msg}: {filename}'
        super().__init__(self.msg)
1037
1038
class PostProcessingError(YoutubeDLError):
    """Post-processing error.

    A PostProcessor's .run() method may raise this to signal that
    the postprocessing task failed.
    """
1045
1046
class DownloadCancelled(YoutubeDLError):
    """ Raised to signal that the download queue should be interrupted """

    msg = 'The download was cancelled'
1050
1051
class ExistingVideoReached(DownloadCancelled):
    """ Cancellation triggered by --break-on-existing """

    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1055
1056
class RejectedVideoReached(DownloadCancelled):
    """ Cancellation triggered by --break-match-filter """

    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1060
1061
class MaxDownloadsReached(DownloadCancelled):
    """ Cancellation triggered by reaching the --max-downloads limit """

    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1065
1066
class ReExtractInfo(YoutubeDLError):
    """ Signals that the video info must be extracted again; `expected` is stored on the instance """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected
1073
1074
class ThrottledDownload(ReExtractInfo):
    """ The download speed dropped below --throttled-rate """

    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always reported with the fixed class message and expected=False
        super().__init__(self.msg, expected=False)
1081
1082
class UnavailableVideoError(YoutubeDLError):
    """Unavailable format error.

    Thrown when a video is requested in a format
    that is not available for that video.
    """

    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            # Assigning creates an instance attribute; the class-level default stays untouched
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1095
1096
class ContentTooShortError(YoutubeDLError):
    """Content too short error.

    FileDownloader objects may raise this when a downloaded file is smaller
    than what the server first announced, which suggests the connection
    was interrupted.
    """

    def __init__(self, downloaded, expected):
        message = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(message)
        # Both sizes are in bytes
        self.downloaded, self.expected = downloaded, expected
1110
1111
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an errno `code`, a `msg` and a derived `reason`.

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED',
    chosen from the error code and from known phrases in the message.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify by errno first, then by message text
        if (code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in msg or 'Disk quota exceeded' in msg):
            reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            reason = 'VALUE_TOO_LONG'
        else:
            reason = 'NOT_SUPPORTED'
        self.reason = reason
1126
1127
class XAttrUnavailableError(YoutubeDLError):
    """Error type adding nothing to YoutubeDLError; presumably for missing xattr support (confirm at raise sites)"""
1130
1131
def is_path_like(f):
    """Return True if `f` is a str, bytes or os.PathLike object"""
    path_types = (str, bytes, os.PathLike)
    return isinstance(f, path_types)
1134
1135
def extract_timezone(date_str):
    """Split a trailing timezone off a date string.

    Returns a tuple (timezone, date_str): `timezone` is a datetime.timedelta
    (zero when nothing is recognized) and `date_str` has the recognized
    timezone suffix removed.
    """
    numeric = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)

    if numeric:
        remainder = date_str[:-len(numeric.group('tz'))]
        sign_char = numeric.group('sign')
        if not sign_char:
            # Plain "Z": UTC, i.e. no offset
            return datetime.timedelta(), remainder
        direction = 1 if sign_char == '+' else -1
        offset = datetime.timedelta(
            hours=direction * int(numeric.group('hours')),
            minutes=direction * int(numeric.group('minutes')))
        return offset, remainder

    # No numeric offset: look for an alphabetic abbreviation after a time and resolve it by name
    named = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    offset_hours = TIMEZONE_NAMES.get(named and named.group('tz').strip())
    if offset_hours is not None:
        date_str = date_str[:-len(named.group('tz'))]
    return datetime.timedelta(hours=offset_hours or 0), date_str
1164
1165
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Convert a date string of the form YYYY-MM-DD<delimiter>HH:MM:SS into a UNIX timestamp

    Fractional seconds are dropped. When `timezone` (a timedelta) is not
    given it is taken from the string via extract_timezone(). Returns None
    for None input or when the string does not parse.
    """
    if date_str is None:
        return None

    without_fraction = re.sub(r'\.[0-9]+', '', date_str)
    if timezone is None:
        timezone, without_fraction = extract_timezone(without_fraction)

    try:
        layout = f'%Y-%m-%d{delimiter}%H:%M:%S'
        moment = datetime.datetime.strptime(without_fraction, layout) - timezone
        return calendar.timegm(moment.timetuple())
    except ValueError:
        return None
1181
1182
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST when `day_first` is truthy, else DATE_FORMATS_MONTH_FIRST"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1185
1186
def unified_strdate(date_str, day_first=True):
    """Convert a free-form date string to 'YYYYMMDD', or return None if it cannot be parsed"""
    if date_str is None:
        return None

    # Commas become spaces, then AM/PM (+ following timezone word) and the timezone suffix are removed
    cleaned = date_str.replace(',', ' ')
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    _, cleaned = extract_timezone(cleaned)

    upload_date = None
    # Every format is tried; a later matching format overrides an earlier one
    for fmt in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the email (RFC 2822 style) date parser
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if upload_date is None else str(upload_date)
1209
1210
def unified_timestamp(date_str, day_first=True):
    """Convert a free-form date/time string to a UNIX timestamp.

    @param date_str   the string to parse; anything that is not a str yields None
    @param day_first  passed to date_formats() to choose the strptime format list
    @returns          the timestamp, or None when no parser accepts the string
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas, pipes and weekday names (all seven days, abbreviated or full), then collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str))

    # NOTE(review): 12 hours are added whenever "PM" occurs anywhere, regardless of the parsed hour
    # - behaviour for "12:xx PM" should be confirmed against DATE_FORMATS
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # The first format that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to the email (RFC 2822 style) date parser
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1242
1243
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, returning `default_ext` when no plausible one is found"""
    if url is None or '.' not in url:
        return default_ext

    # Text after the last dot of the part before the query string
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    return trimmed if trimmed in KNOWN_EXTENSIONS else default_ext
1255
1256
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by '<sub_lang>.<sub_format>' via replace_extension()"""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
1259
1260
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Build a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if relative is None:
        # Plain DATE in the given format
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the offset is resolved recursively
    base = datetime_from_str(relative.group('start'), precision, format)
    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount

    unit = relative.group('unit')
    if unit in ('month', 'year'):
        # Calendar-aware shift; the result is then treated with day granularity
        shifted = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(shifted, unit) if auto_precision else shifted
1301
1302
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Build a date object from a string by delegating to datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
                   A ValueError is raised for anything else.
    """
    strict_pattern = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
    if strict and not re.fullmatch(strict_pattern, date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1313
1314
def datetime_add_months(dt, months):
    """Shift a datetime by a (possibly negative) number of months, clamping the day to the target month's length."""
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    year, month = dt.year + year_shift, month_index + 1
    days_in_month = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, days_in_month))
1322
1323
def datetime_round(dt, precision='day'):
    """
    Round a datetime object to the nearest day/hour/minute/second.

    With precision 'microsecond' the object is returned unchanged; otherwise
    a new UTC-aware datetime is returned.
    """
    if precision == 'microsecond':
        return dt

    epoch_seconds = calendar.timegm(dt.timetuple())
    step = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    # Adding half a step before flooring rounds to the nearest multiple
    rounded = ((epoch_seconds + step / 2) // step) * step
    return datetime.datetime.fromtimestamp(rounded, datetime.timezone.utc)
1340
1341
def hyphenate_date(date_str):
    """
    Turn a 'YYYYMMDD' date into 'YYYY-MM-DD'; any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1350
1351
class DateRange:
    """An inclusive interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings accepted by date_from_str(strict=True); None leaves that side unbounded"""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start, strict=True)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Build a range whose start and end are both the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Tell whether the date (a date object or a string for date_from_str) lies within the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        start_iso, end_iso = self.start.isoformat(), self.end.isoformat()
        return f'{__name__}.{type(self).__name__}({start_iso!r}, {end_iso!r})'

    def __eq__(self, other):
        if not isinstance(other, DateRange):
            return False
        return self.start == other.start and self.end == other.end
1385
1386
@functools.cache
def system_identifier():
    """Return a one-line description of the Python runtime, platform, OpenSSL and libc versions (cached)"""
    implementation = platform.python_implementation()
    if implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    libc_ver = []
    try:
        libc_ver = platform.libc_ver()
    except OSError:  # We may not have access to the executable
        pass

    fields = (
        platform.python_version(),
        implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
    return 'Python %s (%s %s %s) - %s (%s%s)' % fields
1405
1406
@functools.cache
def get_windows_version():
    ''' Return the Windows version as parsed by version_tuple(), or () when not running on Windows (cached) '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1414
1415
def write_string(s, out=None, encoding=None):
    """Write the str `s` to `out` (default: sys.stderr) and flush it.

    The text is encoded first when the stream is opened in binary mode or
    exposes an underlying `buffer`; otherwise it is written as-is.
    """
    assert isinstance(s, str)
    stream = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not stream:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(stream):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    # `mode` might be `None` (Ref: https://github.com/yt-dlp/yt-dlp/issues/8816)
    stream_mode = getattr(stream, 'mode', None) or ''
    if 'b' in stream_mode:
        target = stream
        enc = encoding or preferredencoding()
    elif hasattr(stream, 'buffer'):
        target = stream.buffer
        enc = encoding or getattr(stream, 'encoding', None) or preferredencoding()
    else:
        target, enc = stream, None

    target.write(s.encode(enc, 'ignore') if enc else s)
    stream.flush()
1436
1437
# TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report a deprecation message.

    When `_IN_CLI` is set, each distinct message is reported once (tracked in
    `deprecation_warning._cache`) through `printer` if given, otherwise
    through write_string(). When it is not set, a DeprecationWarning is
    issued via warnings.warn().
    """
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return

    already_reported = deprecation_warning._cache
    if msg in already_reported:
        return
    already_reported.add(msg)

    text = f'{msg}{bug_reports_message()}'
    if printer:
        return printer(text, **kwargs)
    return write_string(f'ERROR: {text}\n', **kwargs)


deprecation_warning._cache = set()
1454
1455
def bytes_to_intlist(bs):
    """Convert a bytes-like or string sequence into a list of integer code values"""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Elements are characters rather than ints
        return list(map(ord, bs))
    return list(bs)  # Python 3 bytes already iterate as ints
1463
1464
def intlist_to_bytes(xs):
    """Pack a list of integers (0-255) into a bytes object; an empty input gives b''"""
    if not xs:
        return b''
    layout = f'{len(xs)}B'
    return struct.pack(layout, *xs)
1469
1470
class LockingUnsupportedError(OSError):
    """OSError raised with a fixed message when file locking is not supported"""

    msg = 'File locking is not supported'

    def __init__(self):
        message = self.msg
        super().__init__(message)
1476
1477
# Cross-platform file locking
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) for the current platform:
#   - Windows: LockFileEx/UnlockFileEx from kernel32 via ctypes
#   - elsewhere: fcntl.flock with an fcntl.lockf fallback
#   - no fcntl available: both functions raise LockingUnsupportedError
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high DWORDs of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock `f` from offset 0; raises BlockingIOError when LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 is set for an exclusive lock, 0x1 for a non-blocking attempt
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError when UnlockFileEx fails."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError(ctypes.GetLastError()))

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock `f` with flock(), falling back to lockf(); BlockingIOError is propagated."""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # Lock is held elsewhere: do not retry with lockf
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Unlock `f`, trying flock(), then lockf(), then flock() with LOCK_NB."""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            # Last attempt is not suppressed, so its OSError reaches the caller
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            """Locking is unavailable without fcntl."""
            raise LockingUnsupportedError()

        def _unlock_file(f):
            """Locking is unavailable without fcntl."""
            raise LockingUnsupportedError()
1564
1565
class locked_file:
    """File wrapper that locks the file on enter and unlocks/closes it on exit.

    Only the modes 'r', 'rb', 'a', 'ab', 'w' and 'wb' are accepted. Reading
    modes take a shared lock, all other modes an exclusive one. Unknown
    attributes are forwarded to the underlying file object.
    """

    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode = mode
        self.block = block

        can_write = any(flag in mode for flag in 'wax+')
        can_read = any(flag in mode for flag in 'r+')

        open_flags = (
            getattr(os, 'O_CLOEXEC', 0)  # UNIX only
            | getattr(os, 'O_BINARY', 0)  # Windows only
            | getattr(os, 'O_NOINHERIT', 0))  # Windows only
        if can_write:
            open_flags |= os.O_CREAT  # O_TRUNC only after locking
        if 'a' in mode:
            open_flags |= os.O_APPEND
        if 'x' in mode:
            open_flags |= os.O_EXCL
        if not can_write:
            open_flags |= os.O_RDONLY
        elif can_read:
            open_flags |= os.O_RDWR
        else:
            open_flags |= os.O_WRONLY

        fd = os.open(filename, open_flags, 0o666)
        self.f = os.fdopen(fd, mode, encoding=encoding)

    def __enter__(self):
        want_exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, want_exclusive, self.block)
            self.locked = True
        except OSError:
            # Could not lock: do not leave the file open
            self.f.close()
            raise

        if 'w' in self.mode:
            # Truncation is postponed until the lock is held
            try:
                self.f.truncate()
            except OSError as err:
                tolerated = (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                )
                if err.errno not in tolerated:
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; `locked` is reset even if unlocking fails"""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1629
1630
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None (cached)"""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1635
1636
def shell_quote(args):
    """Quote each argument with compat_shlex_quote and join them with spaces"""
    encoding = get_filesystem_encoding()
    # We may get a filename encoded with 'encodeFilename', so bytes are decoded first
    return ' '.join(
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
1646
1647
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    `data` is JSON-encoded into a '#__youtubedl_smuggle=...' fragment. Data
    already smuggled in `url` is merged in and takes precedence over `data`.
    The caller's `data` dict is left unmodified.
    """
    url, idata = unsmuggle_url(url, {})
    # Merge into a new dict instead of updating the argument in place
    merged = {**data, **idata}
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
1656
1657
def unsmuggle_url(smug_url, default=None):
    """Return (url, data) for a smuggled URL, or (smug_url, default) when nothing is smuggled"""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1665
1666
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    @param num     the number to format; None, unparsable or negative values yield None
    @param fmt     %-format taking the scaled number and the suffix
    @param factor  the step between suffixes; with 1024 the suffixes become Ki, Mi, ...
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp at both ends: values below 1/factor would otherwise give a negative
        # exponent, which indexes the suffix list from the end
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
1679
1680
def format_bytes(bytes):
    """Format a byte count with two decimals and a 1024-based suffix, or 'N/A' when it cannot be formatted"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
1683
1684
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number><unit>' from `s` and return the number times the unit's multiplier, rounded.

    @param unit_table  mapping of unit name to multiplier
    @param strict      when set, the whole string must match and only '.' is accepted as
                       decimal separator; otherwise a prefix match suffices and ',' is accepted too
    @returns           the rounded product, or None when `s` does not match
    """
    if strict:
        number_pattern, matcher = NUMBER_RE, re.fullmatch
    else:
        number_pattern, matcher = NUMBER_RE.replace(R'\.', '[,.]'), re.match
    unit_pattern = '|'.join(map(re.escape, unit_table))

    found = matcher(rf'(?P<num>{number_pattern})\s*(?P<unit>{unit_pattern})\b', s)
    if not found:
        return None

    value = float(found.group('num').replace(',', '.'))
    return round(value * unit_table[found.group('unit')])
1696
1697
def parse_bytes(s):
    """Convert a byte-quantity string (optional K/M/G/T/P/E/Z/Y suffix, powers of 1024) to an integer"""
    multipliers = {
        unit: 1024 ** power
        for power, unit in enumerate(('', *'KMGTPEZY'))
    }
    return lookup_unit_table(multipliers, s.upper(), strict=True)
1703
1704
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into a number of bytes via lookup_unit_table()"""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        binary, decimal = 1024 ** power, 1000 ** power
        lower = letter.lower()
        # Same seven spellings per prefix, added in the same order for every prefix
        _UNIT_TABLE.update({
            f'{letter}iB': binary,
            f'{letter}B': decimal,
            f'{lower}B': binary,
            f'{letter}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(_UNIT_TABLE, s)
1774
1775
def parse_count(s):
    """Parse a count such as '1,234', '1.2k' or '3M' into an integer; None when nothing matches"""
    if s is None:
        return None

    # Drop a leading non-digit prefix that is followed by whitespace
    text = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000, 'K': 1000,
        'm': 1000 ** 2, 'M': 1000 ** 2,
        'kk': 1000 ** 2, 'KK': 1000 ** 2,
        'b': 1000 ** 3, 'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, text)
    if scaled is not None:
        return scaled

    # Last resort: a leading number followed by whitespace or end of string
    leading = re.match(r'([\d,.]+)(?:$|\s)', text)
    if leading:
        return str_to_int(leading.group(1))
1803
1804
def parse_resolution(s, *, lenient=False):
    """Extract 'width'/'height' from a string such as '1920x1080', '720p' or '4k'.

    Returns a dict with the keys that could be determined ({} when none).
    With `lenient`, the WxH pattern may be adjacent to other alphanumerics.
    """
    if s is None:
        return {}

    dimensions_re = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    if not lenient:
        dimensions_re = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    dimensions = re.search(dimensions_re, s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    # "4k"/"8k": the height is the digit times 540
    k_label = re.search(r'\b([48])[kK]\b', s)
    if k_label:
        return {'height': int(k_label.group(1)) * 540}

    return {}
1828
1829
def parse_bitrate(s):
    """Return the integer in front of 'kbps' in `s`; None for non-strings or when absent"""
    if not isinstance(s, str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
1836
1837
def month_by_name(name, lang='en'):
    """ Return the 1-based number of a month from its full name in the given language

    Languages missing from MONTH_NAMES fall back to English; unknown names yield None.
    """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    with contextlib.suppress(ValueError):
        return names.index(name) + 1
    return None
1847
1848
def month_by_abbreviation(abbrev):
    """ Return the 1-based number of a month from the first three letters of its
        English name (locale-independent); None when unknown """
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
1857
1858
def fix_xml_ampersands(xml_str):
    """Escape every '&' that does not already start a recognized entity as '&amp;'"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1865
1866
def setproctitle(title):
    """Best-effort attempt to set the process name through prctl() of 'libc.so.6'.

    Does nothing when ctypes is missing, the library cannot be loaded or it
    has no prctl symbol.

    @param title  the new process name; must be a str
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initialising the buffer from the bytes makes it one item larger than the data,
    # so the name passed to prctl is always NUL-terminated
    buf = ctypes.create_string_buffer(title.encode())
    try:
        # PR_SET_NAME = 15      Ref: /usr/include/linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1893
1894
def remove_start(s, start):
    """Return `s` without the prefix `start`; `s` is returned unchanged if it is None or lacks the prefix"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1897
1898
def remove_end(s, end):
    """Return `s` without the suffix `end`.

    `s` is returned unchanged when it is None, when `end` is empty or when
    `s` does not end with `end`.
    """
    # An empty `end` must be excluded: s[:-0] is s[:0], which would wipe the whole string
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1901
1902
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding `s`, if present"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1910
1911
def get_domain(url):
    """
    Return the URL's network location with a leading 'www.' removed, or None if empty.

    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1918
1919
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and trailing slashes"""
    segments = urllib.parse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
1923
1924
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any query or fragment"""
    prefix = re.match(r'https?://[^?#]+/', url)
    return prefix.group()
1927
1928
def urljoin(base, path):
    """Join `path` onto `base`, tolerating bytes and invalid input.

    @param base  str or bytes; must start with 'http://', 'https://' or '//' for a join to happen
    @param path  str or bytes; returned as-is when it already starts with '<scheme>://' or '//'
    @returns     the joined URL, or None when `path` is empty/not a string or `base` is unusable
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    # The hyphen is escaped so that "+-." is three literal characters rather than
    # a range that would also admit ","
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+\-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
1942
1943
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to an int scaled by invscale/scale (floor division), or return `default` on failure.

    When `get_attr` is given and `v` is not None, the named attribute of `v` is converted instead.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        converted = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
    return converted
1951
1952
def str_or_none(v, default=None):
    """Return str(v), or `default` when `v` is None"""
    if v is None:
        return default
    return str(v)
1955
1956
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' are stripped from strings first """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
1964
1965
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to a float scaled by invscale/scale, or return `default` when `v` is None or unconvertible"""
    if v is None:
        return default
    try:
        converted = float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
    return converted
1973
1974
def bool_or_none(v, default=None):
    """Return `v` if it is a bool, otherwise `default`"""
    if isinstance(v, bool):
        return v
    return default
1977
1978
def strip_or_none(v, default=None):
    """Return `v` stripped of surrounding whitespace if it is a str, otherwise `default`"""
    if isinstance(v, str):
        return v.strip()
    return default
1981
1982
def url_or_none(url):
    """Return the stripped URL if it is a str with an accepted scheme (or is scheme-relative), else None"""
    if not url or not isinstance(url, str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
1988
1989
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a UNIX timestamp (int/float) or a 'YYYYMMDD' string with `date_format`.

    '%s' in `date_format` is substituted with the integer timestamp before
    strftime is called. Returns `default` for unsupported input or when
    parsing/formatting fails.
    """
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            epoch = datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
            moment = epoch + datetime.timedelta(seconds=timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            moment = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            # Falls through to the AttributeError handler below
            moment = None
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(moment.timestamp())}', date_format)
        return moment.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2007
2008
def parse_duration(s):
    """Parse a duration string into seconds (float), or None if not understood.

    Three notations are tried in order: clock style (``[[DD:]HH:]MM:SS[.ms]``),
    unit-suffixed / ISO 8601-like (``P1DT2H3M4.5S``, ``3 hours 2 min``), and a
    lone fractional amount of hours or minutes (``2.5 hours``).
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None
    FIELDS = ('days', 'hours', 'mins', 'secs', 'ms')

    clock = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if clock:
        days, hours, mins, secs, ms = clock.group(*FIELDS)
    else:
        # Years, months and weeks are recognized but do not contribute
        worded = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if worded:
            days, hours, mins, secs, ms = worded.group(*FIELDS)
        else:
            fractional = re.match(
                r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not fractional:
                return None
            hours, mins = fractional.group('hours', 'mins')

    if ms:
        # The clock notation also allows ':' in front of the fraction
        ms = ms.replace(':', '.')

    total = 0
    for part, multiplier in (
            (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)):
        total += float(part or 0) * multiplier
    return total
2063
2064
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the real extension of `filename`.

    If `expected_real_ext` is given and differs from the actual extension,
    `ext` is appended to the unmodified filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
2071
2072
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the extension of `filename` for `ext`.

    If `expected_real_ext` is given and differs from the actual extension,
    nothing is removed and `ext` is simply appended.
    """
    stem, real_ext = os.path.splitext(filename)
    unexpected = expected_real_ext and real_ext[1:] != expected_real_ext
    base = filename if unexpected else stem
    return f'{base}.{ext}'
2078
2079
def check_executable(exe, args=[]):
    """Try to run `exe` and return its name, or False if it cannot be started.

    `args` can be a list of arguments producing a short output (like -version).
    """
    command = [exe] + args
    try:
        Popen.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        # Raised when the binary cannot be launched
        return False
    else:
        return exe
2088
2089
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout/stderr text.

    Returns False when the process cannot be started (OSError) and None when
    it exits with a non-zero return code.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        output, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else output
2102
2103
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    @param output        text printed by the executable (must be a str)
    @param version_re    regex whose first group is the version; defaults to
                         matching "version <something>"
    @param unrecognized  value returned when the regex does not match
    """
    assert isinstance(output, str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2113
2114
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """Return the version of the specified executable, or False if it is not present.

    `unrecognized` holds one or two fallback values: the first is used when the
    output contains no recognizable version, the last when the executable
    exits with an error.
    """
    fallbacks = variadic(unrecognized)
    assert len(fallbacks) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return fallbacks[-1]
    if not output:
        # False (not runnable) and empty output are passed through unchanged
        return output
    return detect_exe_version(output, version_re, fallbacks[0])
2125
2126
def frange(start=0, stop=None, step=1):
    """Float range

    Like range(), but the bounds and the step may be floats (including
    infinity). A zero step yields nothing.
    """
    if stop is None:
        start, stop = 0, start
    if not step:
        direction = 0
    elif step > 0:
        direction = 1
    else:
        direction = -1
    current = start
    # Multiplying by the direction turns both orientations into "<"
    while direction * current < direction * stop:
        yield current
        current += step
2135
2136
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        # Items already pulled from the iterable, in forward order
        self._cache = _cache if _cache is not None else []
        self._reversed = reverse

    def __iter__(self):
        if not self._reversed:
            yield from self._cache
            for item in self._iterable:
                self._cache.append(item)
                yield item
            return
        # We need to consume the entire iterable to iterate in reverse
        yield from self.exhaust()

    def _exhaust(self):
        """Pull everything into the cache and return the cache itself"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        everything = self._exhaust()
        if self._reversed:
            return everything[::-1]
        return everything[::1]

    @staticmethod
    def _reverse_index(x):
        # ~x maps index i from the front to the same distance from the back
        if x is None:
            return None
        return ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')

        needs_everything = (
            (start or 0) < 0 or (stop or 0) < 0
            or (start is None and step < 0)
            or (stop is None and step > 0))
        if needs_everything:
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
        else:
            missing = max(start or 0, stop or 0) - len(self._cache) + 1
            if missing > 0:
                self._cache.extend(itertools.islice(self._iterable, missing))

        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Only a single item needs to be evaluated to know this
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        return len(self._exhaust())

    def __reversed__(self):
        # The new object shares both the iterable and the cache
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2224
2225
class PagedList:
    """Base class for lists whose items are fetched page by page.

    Subclasses implement `_getslice(start, end)`; `pagefunc(pagenum)` must
    return an iterable with the items of that page.
    """

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return the items of page `pagenum` as a list, using the cache if enabled"""
        cached = self._cache.get(pagenum)
        if cached is not None:
            return cached
        if pagenum > self._pagecount:
            # Beyond the last known page: nothing to fetch
            page_results = []
        else:
            page_results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return the items from index `start` up to (excluding) `end` as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        found = self.getslice(idx, idx + 1)
        if not found:
            raise self.IndexError()
        return found[0]

    def __bool__(self):
        first_item = self.getslice(0, 1)
        return bool(first_item)
2267
2268
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_after = page_first + size
            if start >= page_after:
                continue

            # Offset of the first wanted item inside this page
            if page_first <= start < page_after:
                skip = start % size
            else:
                skip = 0
            # Cut-off inside this page, if the slice ends here
            if end is not None and page_first <= end <= page_after:
                upto = ((end - 1) % size) + 1
            else:
                upto = None

            try:
                page = self.getpage(pagenum)
            except Exception:
                # Remember where fetching failed before propagating the error
                self._pagecount = pagenum - 1
                raise
            if skip != 0 or upto is not None:
                page = page[skip:upto]
            yield from page

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page) + skip < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_after:
                break
2308
2309
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        if end is None:
            stop_page, remaining = self._pagecount, None
        else:
            stop_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start
        # Items of the first page that lie before `start`
        to_skip = start - first_page * self._pagesize

        for pagenum in range(first_page, stop_page):
            page = self.getpage(pagenum)
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    yield from page[:remaining]
                    break
                remaining -= len(page)
            yield from page
2334
2335
class PlaylistEntries:
    """Access to the 'entries' of a playlist info dict using 1-based playlist indices

    Indexing/slicing an instance yields ``(playlist_index, entry)`` tuples.
    `get_requested_items` applies the `playlist_items`, `playliststart` and
    `playlistend` params of the given `ydl` object.
    """

    # Placeholder for positions of an incomplete playlist that have no entry
    MissingEntry = object()
    # Set once the end of the entries is known to have been reached
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        """
        @param ydl          object providing `params`, `report_warning` and `_match_entry`
        @param info_dict    playlist info dict; must have 'entries'
        @raises EntryNotInPlaylist  if info_dict has no 'entries'
        """
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Rebuild a full-length list: each given entry is placed at its
            # (1-based) requested index, every other slot holds MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # A single segment of a playlist items specification:
    # "START", or "[START](:|-)[END][:STEP]" where END may also be inf/infinite
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Parse a comma separated items specification

        Yields an int for every single index and a slice(start, end, step)
        for every range. The end of the slice is converted with
        float_or_none (presumably so that "inf" becomes infinity - TODO confirm).

        @raises ValueError  for empty segments, segments not matching
                            PLAYLIST_ITEMS_RE, or a step of zero
        """
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (playlist_index, entry) for the entries selected by the params

        Iteration ends early when `ydl._match_entry` raises
        ExistingVideoReached or RejectedVideoReached.
        """
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            # Express start/end as an items specification
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the total number of entries if it is known, else None"""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            # With one entry per page, the page count is the entry count
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        """Function returning the entry at a 0-based index

        The function raises self.IndexError when the index is past the end and
        (for lists) EntryNotInPlaylist when the slot holds MissingEntry.
        """
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    # Only an incomplete playlist may be indexed past its end
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Index through ydl's _handle_extraction_exceptions wrapper
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        """Generate (playlist_index, entry) for an int or a slice of 1-based indices

        Unlike list slicing, the stop index is inclusive.
        Negative indices count from the end and require len(self).
        """
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        # Extend the bound by one in the direction of travel to make it inclusive
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                # Going forwards nothing can follow; going backwards
                # lower indices may still exist
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        # Counts by iterating over all entries
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2470
2471
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape sequence in `s` by the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        text, _consumed = decode(match.group(0))
        return text

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2478
2479
def lowercase_escape(s):
    r"""Replace every \uXXXX escape sequence in `s` by the character it denotes"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        text, _consumed = decode(match.group(0))
        return text

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
2486
2487
def parse_qs(url, **kwargs):
    """Parse the query string of `url`; kwargs are passed to urllib.parse.parse_qs"""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2490
2491
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object and close it.

    Every line is one URL. Blank lines and lines starting with '#', ';' or
    ']' are skipped; a leading BOM, surrounding whitespace and trailing
    " #comment" parts are removed.

    @param batch_fd     iterable of str/bytes lines with a close() method
    @returns            list of URLs (str)
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # The BOM may show up either properly decoded or as mis-decoded raw bytes
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit must be a keyword; passing it positionally is deprecated (Python 3.13)
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2509
2510
def urlencode_postdata(*args, **kargs):
    """urllib.parse.urlencode the arguments and return the result as ASCII bytes"""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2513
2514
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url           str or parse url tuple
    @param query_update  mapping merged into the existing query; its values
                         replace those of parameters already present
    @param kwargs        components as named by urllib.parse.urlparse
                         (scheme, netloc, path, params, query, fragment)
    @returns             str
    """
    if isinstance(url, str):
        if not kwargs and not query_update:
            # Nothing to change; avoid the parse/unparse round trip
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged_query = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged_query, True)
    updated = url._replace(**kwargs)
    return urllib.parse.urlunparse(updated)
2533
2534
def update_url_query(url, query):
    """Return `url` with the parameters of `query` merged into its query string"""
    return update_url(url, query_update=query)
2537
2538
def _multipart_encode_impl(data, boundary):
    """Encode `data` as multipart/form-data using the given boundary

    @returns (body as bytes, Content-Type header value)
    @raises ValueError  if the boundary occurs inside any encoded field
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes + b'\r\n'

    parts = []
    for k, v in data.items():
        if isinstance(k, str):
            k = k.encode()
        if isinstance(v, str):
            v = v.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        parts += (delimiter, content)

    parts.append(b'--' + boundary_bytes + b'--\r\n')

    return b''.join(parts), content_type


def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple (body as bytes, Content-Type header value).
    Raises ValueError if a specified boundary occurs in the data, and
    UnicodeError if the data or boundary cannot be encoded.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = _multipart_encode_impl(data, boundary)
            break
        except UnicodeError:
            # UnicodeError is a ValueError too, but a different boundary cannot
            # fix un-encodable data; retrying would loop forever
            raise
        except ValueError:
            if has_specified_boundary:
                raise
            # The random boundary clashed with the data; try another one
            boundary = None

    return out, content_type
2589
2590
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether `x` is an instance of `allowed_types` but not of `blocked_types`

    By default str, bytes and mappings are blocked, so that they count as
    single values rather than as iterables.
    """
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
2595
2596
def variadic(x, allowed_types=NO_DEFAULT):
    """Return `x` itself if it is iterable-like, otherwise wrap it in a 1-tuple

    `allowed_types` are iterable types that should nevertheless be treated as
    single values; it is forwarded as `blocked_types` to is_iterable_like.
    """
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
2602
2603
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Return the result of the first function that succeeds

    Each function is called with `args`/`kwargs`. A call counts as failed if
    it raises one of the common lookup/conversion errors, or if its result is
    not an instance of `expected_type` (when given). Returns None if every
    function fails.
    """
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
2613
2614
def try_get(src, getter, expected_type=None):
    """Apply one or more getter functions to `src` and return the first usable result

    `getter` may be a single function or an iterable of functions;
    see try_call for what counts as usable.
    """
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
2617
2618
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of `dct` for which cndn(key, value) is truthy

    By default, items whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
2621
2622
def merge_dicts(*dicts):
    """Merge dicts, with earlier dicts taking priority over later ones

    None values are never taken. An empty string that was taken from an
    earlier dict is replaced by a str value from a later one.
    """
    merged = {}
    for source in dicts:
        for key, value in source.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
2631
2632
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as str, decoding it with `encoding` if it is not a str already"""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
2635
2636
# Age limits returned by parse_age_limit for US film ratings
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limits returned by parse_age_limit for TV ratings
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """Convert an age limit given as number or rating label to an int, or None

    Accepted are ints from 0 to 21, strings like "18" or "18+", the keys of
    US_RATINGS and the keys of TV_PARENTAL_GUIDELINES (the latter also
    written with "_" or nothing in place of the "-"); labels are matched
    case-insensitively.
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    # Match on the part of the keys after the "TV-" prefix
    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, rating)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
2672
2673
def strip_jsonp(code):
    """Remove a JSONP wrapper and return the callback argument

    Handles ``callback(...)``, ``window.callback(...)`` and the guarded form
    ``callback && callback(...)``, with an optional trailing semicolon and
    ``//`` comments. `code` is returned unchanged if it does not match.
    """
    JSONP_RE = r'''(?sx)^
        (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
        (?:\s*&&\s*(?P=func_name))?
        \s*\(\s*(?P<callback_data>.*)\);?
        \s*?(?://[^\n]*)*$'''
    return re.sub(JSONP_RE, r'\g<callback_data>', code)
2682
2683
def js_to_json(code, vars={}, *, strict=False):
    """Rewrite JavaScript literal code as JSON text

    The code is not evaluated; known constructs are rewritten by a series of
    regex substitutions, so the result is only as good as those patterns.

    @param code     JavaScript source as str
    @param vars     dict of var, val pairs to substitute for identifiers
    @param strict   raise ValueError for unknown identifiers instead of
                    quoting them, and skip the lenient `new ...()` rewrites
    @returns        the converted text (str)
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    # One alternative per quote character: a quoted string with backslash escapes
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    # Block comments, or line comments including the newline ending them
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    # Whitespace with at most one comment in between
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (regex, base) for integer literals that JSON cannot express: hex and octal.
    # The optional trailing ':' is present when the literal is an object key
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Called for every bare '"' (group 1) and every backslash escape (group 2)
        # inside a string literal; returns the text to emit inside the JSON string
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # - '"' and the listed escapes are emitted with a backslash
        # - \x becomes \u00, so that \xNN reads as \u00NN
        # - a backslash followed by a newline is dropped
        # - for any other escaped character only the character itself is kept
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Convert the expression inside ${...} of a template string. If the result
        # is a JSON string, its decoded content is inserted instead of the
        # quoted form
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Replacement callback for every token matched by the final regex
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, runs of '!' and trailing commas are removed
            return ''

        if v[0] in STRING_QUOTES:
            # Strings of any quoting style are re-emitted with double quotes;
            # ${...} placeholders are resolved first for template strings
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Object keys must be strings in JSON
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                # Only reachable when not strict: the value is not valid JSON
                # text, so it is emitted as a JSON string
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            # Unknown identifiers (including unquoted keys) are quoted
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # "new Map([[k, v], ...])" -> JSON object built from the key/value pairs
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    # Rewrite constructor calls before the token-wise substitution
    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # new Date("...") is reduced to its string argument
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        # Any other "new X(...)" is kept, but as a JSON string of its source text
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        # parseInt(...) is replaced by the first run of digits in its argument
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        # (function(...){...})("...") is replaced by its quoted string argument
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    # Alternatives, in order: strings; comments and commas before a closing
    # bracket/brace; "void 0" and identifiers; hex/octal integers (optionally
    # as keys); decimal integers used as keys; runs of '!'
    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2763
2764
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in `quality_ids`,
    or to -1 if the id is not in the list.
    """
    def q(qid):
        with contextlib.suppress(ValueError):
            return quality_ids.index(qid)
        return -1
    return q
2773
2774
# Names of the stages a postprocessor can be attached to
# NOTE(review): presumably in the order in which the stages occur - confirm
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output templates, keyed by output type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output types besides 'default'
# NOTE(review): the values look like the file name suffix used for that type,
# with None where there is no fixed suffix - confirm against the users of this dict
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# This is a template and not a finished regex: {0} must be replaced by the
# pattern for the mapping key and {1} by the pattern for the conversion type
# (e.g. with str.format; literal braces would need to be doubled then).
# A leading run of escaped percent signs (%%) is captured as "prefix"
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2814
2815
def limit_length(s, length):
    """ Add ellipses to overly long strings

    @param s        the string, or None
    @param length   maximum length of the result
    @returns        `s` itself if it fits (or is None); otherwise `s` cut so
                    that it ends in "..." and is `length` characters long.
                    If `length` is too small to even hold the ellipses, `s`
                    is cut to `length` characters without them.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # `length - len(ELLIPSES)` would be negative here and slice from the
        # end, giving a result even longer than the input
        return s[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
2824
2825
def version_tuple(v):
    """Split a version string at "." and "-" into a tuple of ints

    Raises ValueError if any component is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
2828
2829
def is_outdated_version(version, limit, assume_new=True):
    """Whether `version` is older than `limit`

    If `version` is empty or one of the versions cannot be parsed, the
    version is assumed to be new enough when `assume_new` is true.
    """
    when_unknown = not assume_new
    if not version:
        return when_unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return when_unknown
2837
2838
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at module level
    from ..update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
2845
2846
def args_to_str(args):
    """Get a short string representation for a subprocess command"""
    quoted_args = map(compat_shlex_quote, args)
    return ' '.join(quoted_args)
2850
2851
def error_to_str(err):
    """Format an exception as "<class name>: <message>" """
    class_name = type(err).__name__
    return '{}: {}'.format(class_name, err)
2854
2855
def mimetype2ext(mt, default=NO_DEFAULT):
    """Get the file extension for a mimetype

    Parameters of the mimetype (after ";") are ignored. The table is looked
    up with the full type, then with the subtype and finally with the part
    of the subtype after its last "+".

    @param mt       mimetype, like the value of a Content-Type header
    @param default  value to return for unknown mimetypes. If not given,
                    the subtype (with "+" replaced by ".") is returned
                    for unknown mimetypes and None for non-str input
    """
    if not isinstance(mt, str):
        return None if default is NO_DEFAULT else default

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'video/ogg': 'ogv',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    mimetype = mt.partition(';')[0].strip().lower()
    subtype = mimetype.rpartition('/')[2]

    # Candidate keys, from the most to the least specific
    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    if default is NO_DEFAULT:
        return subtype.replace('+', '.')
    return default
2945
2946
def ext2mimetype(ext_or_url):
    """Guess the mimetype for a file extension, file name or URL

    Returns None for empty input or if the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    # A value without any dot is a bare extension; make it a file name
    target = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    return mimetypes.guess_type(target)[0]
2953
2954
def parse_codecs(codecs_str):
    """Split a codecs string (RFC 6381) into video/audio/subtitle codecs

    Returns a dict with 'vcodec', 'acodec' and 'dynamic_range' (plus 'scodec'
    if a subtitle codec was found) where a missing codec is 'none'. If no
    codec is recognized but there are exactly two, they are taken to be the
    video and the audio codec, in that order. Otherwise returns {}.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = [
        codec for codec in map(str.strip, codecs_str.strip().strip(',').split(','))
        if codec]

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                    'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    SUBTITLE_CODECS = ('stpp', 'wvtt')

    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        # Zeros in front of digits are dropped before comparing (av01 -> av1, vp09 -> vp9)
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        family = parts[0]
        if family in VIDEO_CODECS:
            if vcodec:
                # Only the first video codec is used
                continue
            vcodec = full_codec
            if family in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif family == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif family in AUDIO_CODECS:
            acodec = acodec or full_codec
        elif family in SUBTITLE_CODECS:
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if vcodec or acodec or scodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if scodec is not None:
            result['scodec'] = scodec
        return result
    if len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
2995
2996
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension for merging the given streams

    @param vcodecs, vexts   codecs and extensions of the video streams
    @param acodecs, aexts   codecs and extensions of the audio streams
    @param preferences      extensions to choose from, in order of preference
    @returns                the first suitable extension; as fallback "mkv"
                            if allowed, else the last preference
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    def sanitize_codec(codecs):
        # Codec family of the first codec, without zeros and in lowercase
        return try_get(codecs, lambda x: x[0].split('.')[0].replace('0', '').lower())

    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    # First choice: a container known to support both codecs
    for ext in preferences or COMPATIBLE_CODECS.keys():
        supported = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv':
            return ext
        if supported.issuperset((vcodec, acodec)):
            return ext

    # Second choice: decide by the extensions of the streams
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext}:
            return ext
        if any(ext_set.issuperset(current_exts) for ext_set in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3036
3037
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Detect the file extension from the headers of a response

    Tried in order: the filename of an attachment Content-Disposition, the
    x-amz-meta-name header and finally the Content-Type (see mimetype2ext,
    which also receives `default`).
    """
    getheader = url_handle.headers.get

    disposition = getheader('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        ext = mobj and determine_ext(mobj.group('filename'), default_ext=None)
        if ext:
            return ext

    amz_name = getheader('x-amz-meta-name')
    if amz_name:
        # Everything after the last dot (the whole name if there is none)
        ext = amz_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(getheader('Content-Type'), default=default)
3056
3057
def encode_data_uri(data, mime_type):
    """Build a base64 "data:" URI from bytes `data` with the given mime type"""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3060
3061
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    @param content_limit    age limit of the content (None: no restriction)
    @param age_limit        age of the user (None: no limit set)
    """
    no_limit_set = age_limit is None
    content_for_everyone = content_limit is None
    if no_limit_set or content_for_everyone:
        return False
    return age_limit < content_limit
3070
3071
# Known byte-order-marks (BOM), each paired with the codec it announces.
# The UTF-32 LE mark begins with the same bytes as the UTF-16 LE mark,
# so the UTF-32 entries are listed before the UTF-16 ones
BOMS = [
    (codecs.BOM_UTF8, 'utf-8'),
    (codecs.BOM_UTF32_BE, 'utf-32-be'),
    (codecs.BOM_UTF32_LE, 'utf-32-le'),
    (codecs.BOM_UTF16_LE, 'utf-16-le'),
    (codecs.BOM_UTF16_BE, 'utf-16-be'),
]
3080
3081
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Leading byte-order-marks listed in BOMS are removed and select the codec used
    for decoding (UTF-8 if there is none). The content counts as HTML when the
    decoded text starts with "<", optionally preceded by whitespace.

    @returns    The match object (truthy) for HTML content, else None
    """
    codec = 'utf-8'
    for mark, mark_codec in BOMS:
        mark_len = len(mark)
        # The same mark may be repeated; strip every occurrence
        while first_bytes.startswith(mark):
            first_bytes = first_bytes[mark_len:]
            codec = mark_codec

    text = first_bytes.decode(codec, 'replace')
    return re.match(r'^\s*<', text)
3091
3092
def determine_protocol(info_dict):
    """Return the protocol to be used for the given format/info dict.

    An explicit (non-None) 'protocol' entry is returned as is. Otherwise the
    protocol is derived from the sanitized 'url': first from a known streaming
    prefix, then from the extension (m3u8/f4m), and finally from the URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'f4m':
        return 'f4m'
    if ext == 'm3u8':
        # Live streams use 'm3u8'; everything else uses 'm3u8_native'
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'

    return urllib.parse.urlparse(url).scheme
3113
3114
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a header and a list of rows (each a list of values) as aligned text.
    Within a cell, the text after a tab character is right aligned.

    @param header_row   The values of the first line
    @param data         The rows below the header
    @param delim        If given, a line made of this string is inserted below the header
    @param extra_gap    Additional spaces between columns (there is always one)
    @param hide_empty   Drop the columns whose cells are all empty in `data`
    @returns            The table as a single string, lines joined by a newline
    """
    def visible_len(text):
        # Terminal sequences and the alignment tab take no space on screen
        return len(remove_terminal_sequences(text).replace('\t', ''))

    def column_widths(rows):
        return [max(visible_len(str(cell)) for cell in column) for column in zip(*rows)]

    def keep_columns(row, mask):
        # Positions beyond the end of the mask are always kept
        return [cell for keep, cell in itertools.zip_longest(mask, row, fillvalue=True) if keep]

    # A data column of width 0 is empty and acts as a falsy mask entry
    mask = column_widths(data) if hide_empty else []
    header_row = keep_columns(header_row, mask)
    data = [keep_columns(row, mask) for row in data]

    widths = column_widths([header_row, *data])
    gap = extra_gap + 1

    rows = [header_row]
    if delim:
        rule = [delim * (width + gap) for width in widths]
        rule[-1] = rule[-1][:-gap * len(delim)]  # Remove extra_gap from end of delimiter
        rows.append(rule)
    rows.extend(data)

    lines = []
    for row in rows:
        cells = []
        for pos, text in enumerate(map(str, row)):
            padding = widths[pos] - visible_len(text)
            if '\t' in text:
                # The padding goes where the tab was, pushing the rest to the right
                cells.append(text.replace('\t', ' ' * padding) + ' ' * gap)
            else:
                cells.append(text + ' ' * (padding + gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
3145
3146
3147 def _match_one(filter_part, dct, incomplete):
3148 # TODO: Generalize code with YoutubeDL._build_format_filter
3149 STRING_OPERATORS = {
3150 '*=': operator.contains,
3151 '^=': lambda attr, value: attr.startswith(value),
3152 '$=': lambda attr, value: attr.endswith(value),
3153 '~=': lambda attr, value: re.search(value, attr),
3154 }
3155 COMPARISON_OPERATORS = {
3156 **STRING_OPERATORS,
3157 '<=': operator.le, # "<=" must be defined above "<"
3158 '<': operator.lt,
3159 '>=': operator.ge,
3160 '>': operator.gt,
3161 '=': operator.eq,
3162 }
3163
3164 if isinstance(incomplete, bool):
3165 is_incomplete = lambda _: incomplete
3166 else:
3167 is_incomplete = lambda k: k in incomplete
3168
3169 operator_rex = re.compile(r'''(?x)
3170 (?P<key>[a-z_]+)
3171 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3172 (?:
3173 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3174 (?P<strval>.+?)
3175 )
3176 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3177 m = operator_rex.fullmatch(filter_part.strip())
3178 if m:
3179 m = m.groupdict()
3180 unnegated_op = COMPARISON_OPERATORS[m['op']]
3181 if m['negation']:
3182 op = lambda attr, value: not unnegated_op(attr, value)
3183 else:
3184 op = unnegated_op
3185 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3186 if m['quote']:
3187 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3188 actual_value = dct.get(m['key'])
3189 numeric_comparison = None
3190 if isinstance(actual_value, (int, float)):
3191 # If the original field is a string and matching comparisonvalue is
3192 # a number we should respect the origin of the original field
3193 # and process comparison value as a string (see
3194 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3195 try:
3196 numeric_comparison = int(comparison_value)
3197 except ValueError:
3198 numeric_comparison = parse_filesize(comparison_value)
3199 if numeric_comparison is None:
3200 numeric_comparison = parse_filesize(f'{comparison_value}B')
3201 if numeric_comparison is None:
3202 numeric_comparison = parse_duration(comparison_value)
3203 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3204 raise ValueError('Operator %s only supports string values!' % m['op'])
3205 if actual_value is None:
3206 return is_incomplete(m['key']) or m['none_inclusive']
3207 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3208
3209 UNARY_OPERATORS = {
3210 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3211 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3212 }
3213 operator_rex = re.compile(r'''(?x)
3214 (?P<op>%s)\s*(?P<key>[a-z_]+)
3215 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3216 m = operator_rex.fullmatch(filter_part.strip())
3217 if m:
3218 op = UNARY_OPERATORS[m.group('op')]
3219 actual_value = dct.get(m.group('key'))
3220 if is_incomplete(m.group('key')) and actual_value is None:
3221 return True
3222 return op(actual_value)
3223
3224 raise ValueError('Invalid filter part %r' % filter_part)
3225
3226
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    The conditions are separated by "&" (written as "\\&" for a literal "&")
    and all of them must pass.
    @returns            Whether the filter passes
    @param incomplete   Set of keys that is expected to be missing from dct.
                        Can be True/False to indicate all/none of the keys may be missing.
                        All conditions on incomplete keys pass if the key is missing
    """
    # Split only on "&" that is not escaped with a backslash
    for raw_part in re.split(r'(?<!\\)&', filter_str):
        condition = raw_part.replace(r'\&', '&')
        if not _match_one(condition, dct, incomplete):
            return False
    return True
3237
3238
def match_filter_func(filters, breaking_filters=None):
    """Build a function that checks an info dict against match_str() filters.

    @param filters              A filter string or several of them; an entry passes if
                                ANY of them matches. The special filter "-" enables
                                interactive mode
    @param breaking_filters     Filters of the same form. If the function built from
                                them returns anything but None, RejectedVideoReached is raised
    @returns    None if there are no filters at all. Otherwise a function
                `(info_dict, incomplete=False)` that returns None if the entry passes
                (NO_DEFAULT in interactive mode with complete information),
                or a message explaining why it is skipped
    """
    if not (filters or breaking_filters):
        return None

    check_breaking = match_filter_func(breaking_filters) or (lambda _, __: None)
    pending = set(variadic(filters or []))

    interactive = '-' in pending
    pending.discard('-')

    def _match_func(info_dict, incomplete=False):
        rejection = check_breaking(info_dict, incomplete)
        if rejection is not None:
            raise RejectedVideoReached(rejection)

        if pending and not any(match_str(f, info_dict, incomplete) for f in pending):
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, pending))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'

        if interactive and not incomplete:
            return NO_DEFAULT
        return None
    return _match_func
3261
3262
class download_range_func:
    """Callable that yields the sections of a video that should be downloaded.

    @param chapters     Regexes; every chapter whose title matches one is yielded
    @param ranges       (start, end) pairs in seconds. Negative values are taken
                        relative to the end when the duration is known
    @param from_info    Whether to also yield the section given by the
                        'start_time'/'end_time' of the info dict
    """

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):
        """Yield one dict per section; an empty dict stands for "no restriction".

        @param info_dict    The info dict of the video
        @param ydl          Object whose `to_screen` is used to report that
                            no chapter matched
        """

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # A negative time counts back from the end; this needs a known duration
        return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time

    def __eq__(self, other):
        # from_info changes what __call__ yields, so it is part of the identity
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges
                and self.from_info == other.from_info)

    def __repr__(self):
        # All constructor arguments are included so that the object can be recreated
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges}, {self.from_info})'
3303
3304
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression to seconds.

    Accepted are an offset (a number optionally followed by "s") and a clock
    time "H:MM:SS", whose seconds may carry a fraction separated by "." or ":".

    @returns    The time in seconds, or None if the expression is empty or not understood
    """
    if not time_expr:
        return None

    offset = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock is None:
        return None
    hours, minutes, seconds = clock.groups()
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3316
3317
def srt_subtitles_timecode(seconds):
    """Format a time given in seconds as an SRT timecode.

    The four fields of timetuple_from_msec() are laid out as "%02d:%02d:%02d,%03d"
    """
    fields = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % fields
3320
3321
def ass_subtitles_timecode(seconds):
    """Format a time given in seconds as an ASS timecode.

    Same fields as for SRT, except that the last one is given in hundredths
    of a second and is separated by "."
    """
    fields = timetuple_from_msec(seconds * 1000)
    centiseconds = fields.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*fields[:-1], centiseconds)
3325
3326
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT. Each <p> element with usable timing
    becomes one SRT cue; the supported styling is rendered as
    <b>, <i>, <u> and <font> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError If the document contains no <p> element
    '''
    # Current namespace, followed by the legacy ones that are rewritten to it
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> its properties, including the inherited ones
    default_style = {}  # properties of the styles referenced by <body>/<div>

    class TTMLPElementParser:
        # Parser target that turns the content of one <p> into SRT markup.
        # NOTE: The two lists are class attributes that are only mutated in place,
        # so they are shared by all instances created during one dfxp2srt() call
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Later sources override earlier ones:
                # default style < referenced style < inline attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Nothing to emit if the most recent applied style already has this value
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                # Close the tags in the reverse order of opening
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Serialize the element and parse it again with the SRT-producing target
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve the style inheritance. A style can refer to a parent that is
    # declared later in the document, so passes are repeated while this helps
    repeat = False
    while True:
        known_styles = len(styles)
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        # Another pass can only help if this one resolved a new style. Otherwise the
        # remaining parents are missing, cyclic or without any supported property,
        # and repeating would never terminate
        if repeat and len(styles) > known_styles:
            repeat = False
        else:
            break

    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    # The cue number is the position of the <p> in the document (starting at 1),
    # so the numbering has gaps wherever a paragraph is skipped
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Without an end, the duration is needed to compute it
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3493
3494
def cli_option(params, command_option, param, separator=None):
    """Build the command line arguments for an option that takes a value.

    @param params           Mapping in which the value is looked up
    @param command_option   The option as it appears on the command line
    @param param            Key of the value in `params`
    @param separator        If given, option and value are joined with it into a single
                            argument; otherwise they are two separate arguments
    @returns                List of arguments; empty if the value is None
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3500
3501
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line arguments for an option that takes a boolean value.

    The value of `param` in `params` must be True, False or None. It is
    translated to `true_value`/`false_value` and passed on to cli_option(),
    so that None results in no arguments at all.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered = {True: true_value, False: false_value}
    return cli_option(rendered, command_option, flag, separator)
3506
3507
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if `param` in `params` equals `expected_value`, else []"""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3510
3511
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Look up user supplied arguments in `argdict`.

    @param argdict      Mapping of lower-case key to a list of arguments. For backward
                        compatibility it may also be a plain list/tuple of arguments
    @param keys         List/tuple of keys in order of priority. An entry can itself be
                        a group of keys, whose arguments are concatenated
    @param default      Returned when nothing is found
                        NOTE: this is returned as is (not copied), so callers
                        must not modify the result in place
    @param use_compat   Whether a plain list/tuple `argdict` is returned as is;
                        otherwise it is treated as if no arguments were given
    @returns            The arguments of the first entry of `keys` that has any
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        found = [
            args for args in (argdict.get(key.lower()) for key in variadic(key_list))
            if args is not None]
        if found:
            return list(itertools.chain.from_iterable(found))
    return default
3530
3531
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Look up the arguments for `exe` when used by `main_key` via cli_configuration_args().

    The root key is `exe` if it equals `main_key` (case-insensitively), else
    "<main_key>+<exe>". Each entry of `keys` is appended to the root key to form
    the keys to look up. If the bare root key is among them, the combination of
    `main_key` and `exe` (when they differ) and finally "default" are used as
    fallbacks; otherwise `use_compat` is turned off.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    if main_key == exe:
        root_key = exe
    else:
        root_key = f'{main_key}+{exe}'

    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3543
3544
class ISO639Utils:
    """Conversion between two-letter and three-letter language codes"""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # The order matters: long2short() returns the first key that maps to a code
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of `code` are considered.
        Returns None for unknown codes
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns None for unknown codes
        """
        matching = (short for short, long_name in cls._lang_map.items() if long_name == code)
        return next(matching, None)
3749
3750
class ISO3166Utils:
    """Lookup of country names by their two-letter country code"""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter (ISO 3166-1 alpha-2) country code to the corresponding full name

        The lookup is case-insensitive. Returns None for unknown codes
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4012
4013
4014 class GeoUtils:
4015 # Major IPv4 address blocks per country
4016 _country_ip_map = {
4017 'AD': '46.172.224.0/19',
4018 'AE': '94.200.0.0/13',
4019 'AF': '149.54.0.0/17',
4020 'AG': '209.59.64.0/18',
4021 'AI': '204.14.248.0/21',
4022 'AL': '46.99.0.0/16',
4023 'AM': '46.70.0.0/15',
4024 'AO': '105.168.0.0/13',
4025 'AP': '182.50.184.0/21',
4026 'AQ': '23.154.160.0/24',
4027 'AR': '181.0.0.0/12',
4028 'AS': '202.70.112.0/20',
4029 'AT': '77.116.0.0/14',
4030 'AU': '1.128.0.0/11',
4031 'AW': '181.41.0.0/18',
4032 'AX': '185.217.4.0/22',
4033 'AZ': '5.197.0.0/16',
4034 'BA': '31.176.128.0/17',
4035 'BB': '65.48.128.0/17',
4036 'BD': '114.130.0.0/16',
4037 'BE': '57.0.0.0/8',
4038 'BF': '102.178.0.0/15',
4039 'BG': '95.42.0.0/15',
4040 'BH': '37.131.0.0/17',
4041 'BI': '154.117.192.0/18',
4042 'BJ': '137.255.0.0/16',
4043 'BL': '185.212.72.0/23',
4044 'BM': '196.12.64.0/18',
4045 'BN': '156.31.0.0/16',
4046 'BO': '161.56.0.0/16',
4047 'BQ': '161.0.80.0/20',
4048 'BR': '191.128.0.0/12',
4049 'BS': '24.51.64.0/18',
4050 'BT': '119.2.96.0/19',
4051 'BW': '168.167.0.0/16',
4052 'BY': '178.120.0.0/13',
4053 'BZ': '179.42.192.0/18',
4054 'CA': '99.224.0.0/11',
4055 'CD': '41.243.0.0/16',
4056 'CF': '197.242.176.0/21',
4057 'CG': '160.113.0.0/16',
4058 'CH': '85.0.0.0/13',
4059 'CI': '102.136.0.0/14',
4060 'CK': '202.65.32.0/19',
4061 'CL': '152.172.0.0/14',
4062 'CM': '102.244.0.0/14',
4063 'CN': '36.128.0.0/10',
4064 'CO': '181.240.0.0/12',
4065 'CR': '201.192.0.0/12',
4066 'CU': '152.206.0.0/15',
4067 'CV': '165.90.96.0/19',
4068 'CW': '190.88.128.0/17',
4069 'CY': '31.153.0.0/16',
4070 'CZ': '88.100.0.0/14',
4071 'DE': '53.0.0.0/8',
4072 'DJ': '197.241.0.0/17',
4073 'DK': '87.48.0.0/12',
4074 'DM': '192.243.48.0/20',
4075 'DO': '152.166.0.0/15',
4076 'DZ': '41.96.0.0/12',
4077 'EC': '186.68.0.0/15',
4078 'EE': '90.190.0.0/15',
4079 'EG': '156.160.0.0/11',
4080 'ER': '196.200.96.0/20',
4081 'ES': '88.0.0.0/11',
4082 'ET': '196.188.0.0/14',
4083 'EU': '2.16.0.0/13',
4084 'FI': '91.152.0.0/13',
4085 'FJ': '144.120.0.0/16',
4086 'FK': '80.73.208.0/21',
4087 'FM': '119.252.112.0/20',
4088 'FO': '88.85.32.0/19',
4089 'FR': '90.0.0.0/9',
4090 'GA': '41.158.0.0/15',
4091 'GB': '25.0.0.0/8',
4092 'GD': '74.122.88.0/21',
4093 'GE': '31.146.0.0/16',
4094 'GF': '161.22.64.0/18',
4095 'GG': '62.68.160.0/19',
4096 'GH': '154.160.0.0/12',
4097 'GI': '95.164.0.0/16',
4098 'GL': '88.83.0.0/19',
4099 'GM': '160.182.0.0/15',
4100 'GN': '197.149.192.0/18',
4101 'GP': '104.250.0.0/19',
4102 'GQ': '105.235.224.0/20',
4103 'GR': '94.64.0.0/13',
4104 'GT': '168.234.0.0/16',
4105 'GU': '168.123.0.0/16',
4106 'GW': '197.214.80.0/20',
4107 'GY': '181.41.64.0/18',
4108 'HK': '113.252.0.0/14',
4109 'HN': '181.210.0.0/16',
4110 'HR': '93.136.0.0/13',
4111 'HT': '148.102.128.0/17',
4112 'HU': '84.0.0.0/14',
4113 'ID': '39.192.0.0/10',
4114 'IE': '87.32.0.0/12',
4115 'IL': '79.176.0.0/13',
4116 'IM': '5.62.80.0/20',
4117 'IN': '117.192.0.0/10',
4118 'IO': '203.83.48.0/21',
4119 'IQ': '37.236.0.0/14',
4120 'IR': '2.176.0.0/12',
4121 'IS': '82.221.0.0/16',
4122 'IT': '79.0.0.0/10',
4123 'JE': '87.244.64.0/18',
4124 'JM': '72.27.0.0/17',
4125 'JO': '176.29.0.0/16',
4126 'JP': '133.0.0.0/8',
4127 'KE': '105.48.0.0/12',
4128 'KG': '158.181.128.0/17',
4129 'KH': '36.37.128.0/17',
4130 'KI': '103.25.140.0/22',
4131 'KM': '197.255.224.0/20',
4132 'KN': '198.167.192.0/19',
4133 'KP': '175.45.176.0/22',
4134 'KR': '175.192.0.0/10',
4135 'KW': '37.36.0.0/14',
4136 'KY': '64.96.0.0/15',
4137 'KZ': '2.72.0.0/13',
4138 'LA': '115.84.64.0/18',
4139 'LB': '178.135.0.0/16',
4140 'LC': '24.92.144.0/20',
4141 'LI': '82.117.0.0/19',
4142 'LK': '112.134.0.0/15',
4143 'LR': '102.183.0.0/16',
4144 'LS': '129.232.0.0/17',
4145 'LT': '78.56.0.0/13',
4146 'LU': '188.42.0.0/16',
4147 'LV': '46.109.0.0/16',
4148 'LY': '41.252.0.0/14',
4149 'MA': '105.128.0.0/11',
4150 'MC': '88.209.64.0/18',
4151 'MD': '37.246.0.0/16',
4152 'ME': '178.175.0.0/17',
4153 'MF': '74.112.232.0/21',
4154 'MG': '154.126.0.0/17',
4155 'MH': '117.103.88.0/21',
4156 'MK': '77.28.0.0/15',
4157 'ML': '154.118.128.0/18',
4158 'MM': '37.111.0.0/17',
4159 'MN': '49.0.128.0/17',
4160 'MO': '60.246.0.0/16',
4161 'MP': '202.88.64.0/20',
4162 'MQ': '109.203.224.0/19',
4163 'MR': '41.188.64.0/18',
4164 'MS': '208.90.112.0/22',
4165 'MT': '46.11.0.0/16',
4166 'MU': '105.16.0.0/12',
4167 'MV': '27.114.128.0/18',
4168 'MW': '102.70.0.0/15',
4169 'MX': '187.192.0.0/11',
4170 'MY': '175.136.0.0/13',
4171 'MZ': '197.218.0.0/15',
4172 'NA': '41.182.0.0/16',
4173 'NC': '101.101.0.0/18',
4174 'NE': '197.214.0.0/18',
4175 'NF': '203.17.240.0/22',
4176 'NG': '105.112.0.0/12',
4177 'NI': '186.76.0.0/15',
4178 'NL': '145.96.0.0/11',
4179 'NO': '84.208.0.0/13',
4180 'NP': '36.252.0.0/15',
4181 'NR': '203.98.224.0/19',
4182 'NU': '49.156.48.0/22',
4183 'NZ': '49.224.0.0/14',
4184 'OM': '5.36.0.0/15',
4185 'PA': '186.72.0.0/15',
4186 'PE': '186.160.0.0/14',
4187 'PF': '123.50.64.0/18',
4188 'PG': '124.240.192.0/19',
4189 'PH': '49.144.0.0/13',
4190 'PK': '39.32.0.0/11',
4191 'PL': '83.0.0.0/11',
4192 'PM': '70.36.0.0/20',
4193 'PR': '66.50.0.0/16',
4194 'PS': '188.161.0.0/16',
4195 'PT': '85.240.0.0/13',
4196 'PW': '202.124.224.0/20',
4197 'PY': '181.120.0.0/14',
4198 'QA': '37.210.0.0/15',
4199 'RE': '102.35.0.0/16',
4200 'RO': '79.112.0.0/13',
4201 'RS': '93.86.0.0/15',
4202 'RU': '5.136.0.0/13',
4203 'RW': '41.186.0.0/16',
4204 'SA': '188.48.0.0/13',
4205 'SB': '202.1.160.0/19',
4206 'SC': '154.192.0.0/11',
4207 'SD': '102.120.0.0/13',
4208 'SE': '78.64.0.0/12',
4209 'SG': '8.128.0.0/10',
4210 'SI': '188.196.0.0/14',
4211 'SK': '78.98.0.0/15',
4212 'SL': '102.143.0.0/17',
4213 'SM': '89.186.32.0/19',
4214 'SN': '41.82.0.0/15',
4215 'SO': '154.115.192.0/18',
4216 'SR': '186.179.128.0/17',
4217 'SS': '105.235.208.0/21',
4218 'ST': '197.159.160.0/19',
4219 'SV': '168.243.0.0/16',
4220 'SX': '190.102.0.0/20',
4221 'SY': '5.0.0.0/16',
4222 'SZ': '41.84.224.0/19',
4223 'TC': '65.255.48.0/20',
4224 'TD': '154.68.128.0/19',
4225 'TG': '196.168.0.0/14',
4226 'TH': '171.96.0.0/13',
4227 'TJ': '85.9.128.0/18',
4228 'TK': '27.96.24.0/21',
4229 'TL': '180.189.160.0/20',
4230 'TM': '95.85.96.0/19',
4231 'TN': '197.0.0.0/11',
4232 'TO': '175.176.144.0/21',
4233 'TR': '78.160.0.0/11',
4234 'TT': '186.44.0.0/15',
4235 'TV': '202.2.96.0/19',
4236 'TW': '120.96.0.0/11',
4237 'TZ': '156.156.0.0/14',
4238 'UA': '37.52.0.0/14',
4239 'UG': '102.80.0.0/13',
4240 'US': '6.0.0.0/8',
4241 'UY': '167.56.0.0/13',
4242 'UZ': '84.54.64.0/18',
4243 'VA': '212.77.0.0/19',
4244 'VC': '207.191.240.0/21',
4245 'VE': '186.88.0.0/13',
4246 'VG': '66.81.192.0/20',
4247 'VI': '146.226.0.0/16',
4248 'VN': '14.160.0.0/11',
4249 'VU': '202.80.32.0/20',
4250 'WF': '117.20.32.0/21',
4251 'WS': '202.4.32.0/19',
4252 'YE': '134.35.0.0/16',
4253 'YT': '41.242.116.0/22',
4254 'ZA': '41.0.0.0/11',
4255 'ZM': '102.144.0.0/13',
4256 'ZW': '102.177.192.0/18',
4257 }
4258
@classmethod
def random_ipv4(cls, code_or_block):
    """
    Pick a random IPv4 address from a country's block or an explicit CIDR block

    @param code_or_block  a two-character string is upper-cased and looked up
                          in `cls._country_ip_map`; any other value is used
                          directly as an "<address>/<prefix length>" block
    @returns              dotted-quad address string, or None when the
                          two-character code has no entry in the map
    """
    cidr = code_or_block
    if len(code_or_block) == 2:
        cidr = cls._country_ip_map.get(code_or_block.upper())
        if not cidr:
            return None
    network, prefix_length = cidr.split('/')
    lowest = struct.unpack('!L', socket.inet_aton(network))[0]
    # Setting every host bit gives the highest address of the block
    highest = lowest | (0xffffffff >> int(prefix_length))
    chosen = random.randint(lowest, highest)
    return str(socket.inet_ntoa(struct.pack('!L', chosen)))
4272
4273
4274 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4275 # released into Public Domain
4276 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4277
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    Values that are not positive produce a single zero byte.
    If optional blocksize is given and greater than zero, zero bytes are
    prepended so that the length becomes a multiple of blocksize.
    """
    n = int(n)
    words = []
    while n > 0:
        # Peel off the lowest 32 bits; the words are collected least significant first
        words.append(struct.pack('>I', n & 0xffffffff))
        n >>= 32
    # Drop leading zero bytes; nothing is left only when n was not positive
    packed = b''.join(reversed(words)).lstrip(b'\000') or b'\000'
    if blocksize > 0 and len(packed) % blocksize:
        packed = b'\000' * (blocksize - len(packed) % blocksize) + packed
    return packed
4306
4307
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Interpret a byte string as a big-endian unsigned integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Left-pad with zero bytes so the data splits into whole 32-bit words
    shortfall = -len(s) % 4
    if shortfall:
        s = b'\000' * shortfall + s
    value = 0
    for offset in range(0, len(s), 4):
        word = struct.unpack('>I', s[offset:offset + 4])[0]
        value = (value << 32) + word
    return value
4323
4324
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt one block with OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object (read in reversed byte order)
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    return format(pow(plain_number, exponent, modulus), 'x')
4340
4341
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme (v1.5 encryption padding, block type 2)

    The result is laid out as `00 02 <non-zero padding> 00 <data>`.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves no room for the 11 bytes of overhead
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # The padding bytes must be non-zero: the first zero byte after the
    # `00 02` header marks where the message starts, so a zero inside the
    # padding would make the receiver cut the block at the wrong place
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4355
4356
4357 def _base_n_table(n, table):
4358 if not table and not n:
4359 raise ValueError('Either table or n must be specified')
4360 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4361
4362 if n and n != len(table):
4363 raise ValueError(f'base {n} exceeds table length {len(table)}')
4364 return table
4365
4366
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string, using the digits from _base_n_table(n, table)"""
    alphabet = _base_n_table(n, table)
    if not num:
        return alphabet[0]

    radix = len(alphabet)
    digits = []
    while num:
        num, remainder = divmod(num, radix)
        # Digits come out least significant first
        digits.append(alphabet[remainder])
    return ''.join(reversed(digits))
4378
4379
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int, using the digits from _base_n_table(n, table)"""
    alphabet = _base_n_table(n, table)
    digit_values = {char: index for index, char in enumerate(alphabet)}
    radix = len(digit_values)
    value = 0
    for symbol in string:
        value = value * radix + digit_values[symbol]
    return value
4387
4388
def decode_packed_codes(code):
    """
    Expand code matched by PACKED_CODES_RE back into readable source

    The match supplies the obfuscated code, a numeric base, a symbol count and
    a "|"-separated symbol list. Each word of the obfuscated code is replaced
    by the symbol whose index it encodes in that base; an empty symbol means
    the word stands for itself.
    """
    packed = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = packed.groups()
    radix = int(base)
    remaining = int(count)
    words = symbols.split('|')

    replacements = {}
    while remaining:
        remaining -= 1
        token = encode_base_n(remaining, radix)
        replacements[token] = words[remaining] or token

    def _expand(mobj):
        return replacements[mobj.group(0)]

    return re.sub(r'\b(\w+)\b', _expand, obfuscated_code)
4405
4406
def caesar(s, alphabet, shift):
    """
    Shift every character of `s` that occurs in `alphabet` by `shift`
    positions, wrapping around; other characters are kept as they are.
    A shift of 0 returns `s` unchanged.
    """
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = []
    for char in s:
        if char in alphabet:
            char = alphabet[(alphabet.index(char) + shift) % size]
        shifted.append(char)
    return ''.join(shifted)
4414
4415
def rot47(s):
    """Apply caesar() with a shift of 47 over the 94 printable, non-space ASCII characters"""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4418
4419
def parse_m3u8_attributes(attrib):
    """
    Parse a comma-separated `KEY=value` attribute list into a dict

    Values may be double-quoted (and may then contain commas); the
    surrounding quotes are removed.
    """
    attributes = {}
    for mobj in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        name, value = mobj.group('key', 'val')
        attributes[name] = value[1:-1] if value.startswith('"') else value
    return attributes
4427
4428
def urshift(val, n):
    """Right-shift `val` by `n` bits, first adding 2**32 to negative values (32-bit unsigned shift)"""
    if val < 0:
        val += 0x100000000
    return val >> n
4431
4432
def write_xattr(path, key, value):
    """
    Store `value` as the extended attribute `key` of the file at `path`

    Tries, in order: an NTFS alternate data stream (when compat_os_name is 'nt'),
    a Python-level setxattr implementation (os.setxattr, pyxattr or xattr),
    and finally the "setfattr" or "xattr" executables.

    @param path   path of the file to annotate
    @param key    attribute name
    @param value  attribute value; written as binary data and decoded with
                  .decode() before being passed to an executable, so it is
                  expected to be bytes
    @raises XAttrMetadataError     when writing fails (OSError or non-zero exit code)
    @raises XAttrUnavailableError  when no module or executable is available
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        # ':' separates the file name from the stream name, so it cannot be part of the key
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    setxattr = None
    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        # With an older pyxattr, setxattr stays None and Method 2 is used instead
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the "xattr" or "pyxattr" Python modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The executables receive the value as a command-line argument, so it must be text
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4484
4485
def random_birthday(year_field, month_field, day_field):
    """
    Build a dict holding a random date between 1950-01-01 and 1995-12-31 (inclusive)

    The three arguments are the keys under which the year, month and day are
    stored; the values are decimal strings without zero padding.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    picked = earliest + datetime.timedelta(days=random.randint(0, span_days))
    fields = (year_field, month_field, day_field)
    parts = (picked.year, picked.month, picked.day)
    return {field: str(part) for field, part in zip(fields, parts)}
4496
4497
def find_available_port(interface=''):
    """
    Ask the OS for a free port by binding a temporary socket to port 0 on `interface`

    Returns the port number, or None if the socket could not be created or bound.
    The socket is closed again before returning.
    """
    with contextlib.suppress(OSError):
        with socket.socket() as sock:
            sock.bind((interface, 0))
            return sock.getsockname()[1]
    return None
4505
4506
# Templates for internet shortcut files, which are plain text files.
# Every template contains a printf-style `%(url)s` placeholder; the desktop
# template additionally contains a `%(filename)s` placeholder.

# ".url" shortcut: a single INI-style [InternetShortcut] section
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# ".webloc" shortcut: an XML property list with a single "URL" key
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# ".desktop" shortcut: a [Desktop Entry] section of Type=Link
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps the shortcut kind to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4538
4539
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    Existing percent-escapes are kept as they are (`%3C` does not become `%253C`); only characters that are not yet escaped get percent-encoded, using their UTF-8 bytes. The hostname is converted with the 'idna' codec, and an explicit port 80 is dropped.

    Raises ValueError for IPv6 (bracketed) hosts, which are not supported.
    """
    parts = urllib.parse.urlparse(iri)

    # Reading `.netloc` with a single, unbalanced bracket raises ValueError by itself
    if '[' in parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` strings below list the characters that are left unescaped in
    # addition to letters, digits and '_.-'. '%' is always among them, which is
    # what keeps existing escapes intact.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
    userinfo_safe = r"!$%&'()*+,~"

    authority = []
    if parts.username:
        authority.append(urllib.parse.quote(parts.username, safe=userinfo_safe))
        if parts.password is not None:
            authority.append(':' + urllib.parse.quote(parts.password, safe=userinfo_safe))
        authority.append('@')

    # Punycode for Unicode hostnames; the 'idna' encoding produces ASCII text
    authority.append(parts.hostname.encode('idna').decode())
    port = parts.port
    if port is not None and port != 80:
        authority.append(':' + str(port))

    path = urllib.parse.quote_plus(parts.path, safe=r"!$%&'()*+,/:;=@|~")
    # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
    params = urllib.parse.quote_plus(parts.params, safe=r"!$%&'()*+,/:;=@|~")
    # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
    query = urllib.parse.quote_plus(parts.query, safe=r"!$%&'()*+,/:;=?@{|}~")
    fragment = urllib.parse.quote_plus(parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")

    return urllib.parse.urlunparse(
        (parts.scheme, ''.join(authority), path, params, query, fragment))
4582
4583
def to_high_limit_path(path):
    """
    Return `path` in a form that is not bound by MAX_PATH on Windows

    On 'win32' and 'cygwin' the absolute path is prefixed with `\\\\?\\`;
    on every other platform `path` is returned unchanged.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
4590
4591
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """
    Look up `field` in `obj` with traverse_obj and format the result

    @param obj       object to read from
    @param field     one path or several paths, passed on to traverse_obj
    @param template  %-format string applied to the processed value
    @param ignore    value(s) for which `default` is returned instead;
                     when not given, every falsy value is ignored
    @param default   result for ignored values
    @param func      applied to the value before formatting
    """
    value = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not value
    else:
        skip = value in variadic(ignore)
    if skip:
        return default
    return template % func(value)
4597
4598
def clean_podcast_url(url):
    """
    Remove known podcast tracking/analytics redirect prefixes from `url`

    Every occurrence of a known tracker host (with its path prefix and the
    following slash) is deleted. If that leaves two schemes at the start of
    the URL, only the second one is kept.
    """
    tracker_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/'''
    stripped = re.sub(tracker_prefix_re, '', url)
    return re.sub(r'^\w+://(\w+://)', r'\1', stripped)
4619
4620
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """
    Generate a random version 4 UUID as a lower-case, hyphenated string

    In the template, `x` stands for any hex digit, `4` is the fixed version
    digit and `y` is the variant digit, which is limited to 8, 9, a or b.
    """
    def _random_digit(mobj):
        # The two high bits of the variant digit must be `10`, i.e. indices 8-11 of the hex table
        if mobj.group(0) == 'y':
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4626
4627
def make_dir(path, to_screen=None):
    """
    Create the parent directory of `path` (including missing ancestors)

    @param path       path of a file whose directory should exist
    @param to_screen  optional callable that receives an error message
                      when the directory cannot be created
    @returns          True on success (or when `path` has no directory part),
                      False if creating the directory failed
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Only report when a reporter was actually given; the default of None must not be called
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4638
4639
def get_executable_path():
    """Return the absolute directory of the path reported by _get_variant_and_executable_path()"""
    # Imported here rather than at module level
    from ..update import _get_variant_and_executable_path

    executable = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(executable))
4644
4645
def get_user_config_dirs(package_name):
    """
    Yield the candidate per-user configuration directories for `package_name`

    Order: the XDG config directory, the %APPDATA% directory (only when the
    `appdata` environment variable is set), then a dot-directory in the home
    directory.
    """
    # .config (e.g. ~/.config/package_name)
    config_home = os.getenv('XDG_CONFIG_HOME')
    if not config_home:
        config_home = compat_expanduser('~/.config')
    yield os.path.join(config_home, package_name)

    # appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # home (~/.package_name)
    home = compat_expanduser('~')
    yield os.path.join(home, f'.{package_name}')
4658
4659
def get_system_config_dirs(package_name):
    """Yield the system-wide configuration directories for `package_name`"""
    # /etc/package_name
    yield from [os.path.join('/etc', package_name)]
4663
4664
def time_seconds(**kwargs):
    """
    Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the offset described by `kwargs`.

    `kwargs` are passed to datetime.timedelta (e.g. hours=9).
    """
    offset = datetime.timedelta(**kwargs)
    return time.time() + offset.total_seconds()
4670
4671
4672 # create a JSON Web Signature (jws) with HS256 algorithm
4673 # the resulting format is in JWS Compact Serialization
4674 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
4675 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """
    Build an HS256-signed token of the form `header.payload.signature`

    @param payload_data  JSON-serializable payload
    @param key           signing key (str)
    @param headers       extra header fields, merged over `alg` and `typ`
    @returns             the token as bytes

    NB: all three parts use standard base64 (base64.b64encode), padding included
    """
    header_data = {'alg': 'HS256', 'typ': 'JWT'}
    if headers:
        header_data.update(headers)

    def _b64_json(obj):
        return base64.b64encode(json.dumps(obj).encode())

    signing_input = b'.'.join((_b64_json(header_data), _b64_json(payload_data)))
    digest = hmac.new(key.encode(), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(digest)
4689
4690
4691 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """
    Return the decoded JSON payload of a `header.payload.signature` token

    The header and the signature are ignored; the signature is NOT verified.
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # add trailing ='s that may have been stripped, superfluous ='s are ignored
    padded_payload = payload_b64 + '==='
    return json.loads(base64.urlsafe_b64decode(padded_payload))
4697
4698
# Whether virtual terminal processing has been enabled on the Windows console:
# starts as False on Windows (compat_os_name == 'nt') and as None elsewhere
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4700
4701
@functools.cache
def supports_terminal_sequences(stream):
    """
    Tell whether terminal escape sequences may be written to `stream`

    Requires WINDOWS_VT_MODE to be truthy on Windows, or a non-empty TERM
    environment variable elsewhere, and `stream.isatty()` to be true.
    Any error raised by `isatty()` counts as "not supported".
    Results are cached per stream.
    """
    if compat_os_name == 'nt':
        terminal_ready = bool(WINDOWS_VT_MODE)
    else:
        terminal_ready = bool(os.getenv('TERM'))
    if not terminal_ready:
        return False

    try:
        return stream.isatty()
    except BaseException:
        return False
4713
4714
def windows_enable_vt_mode():
    """
    Turn on virtual terminal processing for the Windows console

    Does nothing when get_windows_version() is below (10, 0, 10586).
    On success, WINDOWS_VT_MODE is set to True and the cache of
    supports_terminal_sequences is cleared.
    Raises Exception if GetConsoleMode or SetConsoleMode reports failure.

    Ref: https://bugs.python.org/issue30075
    """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported lazily; NOTE(review): msvcrt is presumably Windows-only, which would be why - confirm
    import ctypes
    import ctypes.wintypes
    import msvcrt

    # Console mode flag that is OR-ed into the current mode below
    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Keep every flag that is already set and add the VT processing flag
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Cached answers were computed while VT mode was still off
    supports_terminal_sequences.cache_clear()
4745
4746
# Matches ESC "[", one or more characters other than "m", then the closing "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with every match of _terminal_sequences_re removed"""
    return re.sub(_terminal_sequences_re, '', string)
4752
4753
def number_of_digits(number):
    """Return the length of `number` formatted with '%d' (a minus sign counts as a character)"""
    rendered = '%d' % number
    return len(rendered)
4756
4757
def join_nonempty(*values, delim='-', from_dict=None):
    """
    Join the string forms of all truthy `values` with `delim`

    When `from_dict` is given, each value is treated as a path and is first
    looked up in `from_dict` with traverse_obj.
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
4762
4763
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Rewrite thumbnails so that they match the largest format

    The largest (width, height) pair among `formats` is determined, ordered by
    width first. For every thumbnail, matches of `url_width_re` in its URL are
    replaced with that width, and the dimensions are added to the thumbnail.
    `thumbnails` is returned untouched when no format has a width.

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    dimensions = (tuple(fmt.get(key) or 0 for key in _keys) for fmt in formats)
    largest = max(dimensions, default=(0, 0))
    if not largest[0]:
        return thumbnails

    width_text = str(largest[0])
    rescaled = []
    for thumbnail in thumbnails:
        scaled_url = re.sub(url_width_re, width_text, thumbnail['url'])
        rescaled.append(merge_dicts({'url': scaled_url}, dict(zip(_keys, largest)), thumbnail))
    return rescaled
4784
4785
def parse_http_range(range):
    """
    Parse value of "Range" or "Content-Range" HTTP header into tuple.

    Returns (start, end, total); `end` and `total` are None when absent, and
    all three are None when the header is empty or not a byte range.
    """
    unknown = (None, None, None)
    if not range:
        return unknown
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if mobj is None:
        return unknown
    start, end, total = mobj.groups()
    return int(start), int_or_none(end), int_or_none(total)
4794
4795
def read_stdin(what):
    """
    Return sys.stdin, first announcing what is about to be read from it

    With a falsy `what`, no message is written.
    """
    if not what:
        return sys.stdin

    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
4801
4802
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @param data  the leading bytes of the file
    @returns (encoding, bytes to skip); encoding is None when nothing was detected
    """

    # BOM marks are given priority over declarations
    detected = next(((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if detected is not None:
        return detected

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    # A "# coding: <name>" declaration is only recognised at the very start of the data
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if declaration is None:
        return None, 0
    return declaration.group(1).decode(), 0
4819
4820
class Config:
    """
    A list of command-line style arguments plus the nested configurations it refers to

    Each Config holds its own arguments (`own_args`), optionally the file they
    came from (`filename`), and child Config objects (`configs`) created for
    every location found in the parsed `config_locations` option.
    """
    # Arguments given to init(); None until then
    own_args = None
    # Set to own_args once load_configs() has parsed them
    parsed_args = None
    # Path of the file the arguments were read from, if any
    filename = None
    # Guards against init() being called after the configs were loaded
    __initialized = False

    def __init__(self, parser, label=None):
        """
        @param parser  object providing parse_known_args(), parse_args() and error()
        @param label   optional name used by __str__
        """
        self.parser, self.label = parser, label
        # _loaded_paths is shared with child configs (see append_config) so that no location is loaded twice
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Store the arguments and their source file, then load nested configs; returns load_configs()"""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """
        Parse own_args and load every config location they name

        @returns False if this config's file was already loaded, True otherwise
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            # '-' stands for standard input, which is read at most once
            if location == '-':
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against the directory of this config's file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                # NOTE(review): presumably parser.error() does not return - confirm
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        """Describe this config and, indented with "| ", all nested configs; login values are hidden"""
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """
        Read `filename` and split its contents into a list of arguments

        @param filename  path of the config file
        @param default   returned as is when the file cannot be opened
        @raises ValueError  if the contents cannot be decoded or split
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            # The first 512 bytes are inspected for a BOM or coding declaration
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` in which the values of login related options are replaced with 'PRIVATE'"""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handles the "--option=value" form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handles the "--option value" form: the value is the following item
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config from `args` (passed to init) and keep it if init() reports it as newly loaded"""
        config = type(self)(self.parser, label)
        # Share the set of loaded locations with the child
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Yield the arguments of all nested configs (last added first), followed by this config's own"""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        """Run the parser's parse_known_args() over all_args"""
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        """Run the parser's parse_args() over all_args"""
        return self.parser.parse_args(self.all_args)
4928
4929
def merge_headers(*dicts):
    """
    Merge dicts of http headers case insensitively, prioritizing the latter ones

    Header names are normalized with str.title() in the result.
    """
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            merged[name.title()] = value
    return merged
4933
4934
def cached_method(f):
    """
    Cache the results of a method per instance and per argument values

    Arguments are normalized through the method's signature (defaults applied),
    so equivalent positional and keyword calls share one cache entry.
    The cache is stored on the instance under `_cached_method__cache`.
    """
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        call = signature.bind(self, *args, **kwargs)
        call.apply_defaults()
        # The first bound argument (the instance) is not part of the key
        key = tuple(itertools.islice(call.arguments.values(), 1, None))

        per_instance = vars(self).setdefault('_cached_method__cache', {})
        results = per_instance.setdefault(f.__name__, {})
        if key not in results:
            results[key] = f(self, *args, **kwargs)
        return results[key]
    return wrapper
4950
4951
class classproperty:
    """
    property access for class methods with optional caching

    Usable both as `@classproperty` and as `@classproperty(cache=True)`.
    With caching, the function runs only once for each class it is accessed on.
    """

    def __new__(cls, func=None, *args, **kwargs):
        if func:
            return super().__new__(cls)
        # Called with options only: wait for the function to be supplied
        return functools.partial(cls, *args, **kwargs)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        # Maps class -> computed value; None means caching is off
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        cache = self._cache
        if cache is None:
            return self.func(cls)
        if cls not in cache:
            cache[cls] = self.func(cls)
        return cache[cls]
4970
4971
class function_with_repr:
    """
    Callable wrapper around `func` with a controllable repr()

    repr() gives `repr_` when it is set, and "<module>.<qualname>" of the
    wrapped function otherwise. Calls are forwarded unchanged.
    """

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        custom = self.__repr
        if custom:
            return custom
        target = self.func
        return f'{target.__module__}.{target.__qualname__}'
4984
4985
class Namespace(types.SimpleNamespace):
    """SimpleNamespace that iterates over its attribute values and exposes its items"""

    def __iter__(self):
        return iter(vars(self).values())

    @property
    def items_(self):
        # (name, value) pairs of all attributes
        return vars(self).items()
4995
4996
# File extensions grouped by kind of media (each group is a tuple of
# extensions without the leading dot)
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# `video` and `audio` are extended to also include the `common_*` extensions
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions in one tuple
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5011
5012
class RetryManager:
    """Iterable that drives a retry loop

    Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue

    Another attempt is made only when the loop body stored an error and the
    number of attempts made so far does not exceed `retries`. After every
    failed attempt, the error callback is called with (error, attempt, retries).
    """
    attempt = 0
    _error = None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries if _retries else 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # NO_DEFAULT marks an attempt that finished without reporting an error
        if self._error is NO_DEFAULT:
            return False
        return self.attempt <= self.retries

    @property
    def error(self):
        return None if self._error is NO_DEFAULT else self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            failure = self.error
            if failure:
                self.error_callback(failure, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries

        Once `count` exceeds `retries`, the failure is passed to `error`
        (or `e` is raised if there is no `error` callable). Otherwise a
        warning is issued and, if `sleep_func` yields a delay, the delay is
        announced through `info` and slept.
        """
        if count > retries:
            if not error:
                raise e
            if count > 1:
                return error(f'{e}. Giving up after {count - 1} retries')
            return error(str(e))

        if not count:
            return warn(e)
        if isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        # sleep_func is either a callable taking the retry index as `n`, or the delay itself
        if callable(sleep_func):
            delay = float_or_none(sleep_func(n=count - 1))
        else:
            delay = sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5067
5068
def make_archive_id(ie, video_id):
    """
    Build the "<extractor key> <video id>" identifier, with the key in lower case

    `ie` is either the key itself (str) or an object providing ie_key().
    """
    if not isinstance(ie, str):
        ie = ie.ie_key()
    return f'{ie.lower()} {video_id}'
5072
5073
def truncate_string(s, left, right=0):
    """
    Shorten `s` to `left` leading plus `right` trailing characters

    The last three of the `left` characters are replaced by "...".
    `s` is returned unchanged when it is None or already short enough.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5079
5080
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """
    Resolve a list of option names into an ordered set of items

    @param options     names to process in order; a leading "-" removes the
                       item(s) instead of adding them
    @param alias_dict  maps alias names to lists of options; must contain
                       "all", which lists every valid item
    @param use_regex   treat non-alias names as case-insensitive regexes that
                       must match a whole item
    @param start       items to begin with
    @raises ValueError for a name that is neither an alias nor (without
                       use_regex) a valid item
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for option in options:
        negated = option.startswith('-')
        name = option[1:] if negated else option

        if name in alias_dict:
            expansion = alias_dict[name]
            if negated:
                # Discarding an alias flips the sign of each of its members
                expansion = [
                    member[1:] if member.startswith('-') else f'-{member}' for member in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            matches = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            matches = [name]
        else:
            raise ValueError(name)

        if not negated:
            requested.extend(matches)
            continue
        for item in matches:
            # Remove every occurrence, not just the first one
            while item in requested:
                requested.remove(item)

    return orderedSet(requested)
5109
5110
5111 # TODO: Rewrite
class FormatSorter:
    """Compute sort keys for format dicts from a list of sort-field specifications.

    A specification is a string matched by ``regex``: an optional ``+`` (reverse),
    a field name, and optionally ``:`` or ``~`` followed by a limit (``~`` means
    "closest to the limit"). The effective order is built in ``evaluate_params``
    from forced fields, priority fields, the user's ``format_sort`` param, the
    extractor-supplied fields and finally ``default``.

    ``settings`` maps each known field name to its configuration (``type``,
    ``field``, ``convert``, ``order``, ...). The class-level table is only a
    template: every instance works on its own copy (see ``__init__``), because
    evaluating the sort specification stores per-run state (``reverse``,
    ``closest``, ``limit``, ``limit_text``, cached defaults) inside it.
    """

    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        """Prepare the sort order for one sorting run.

        @param ydl               Object providing `params` (dict), `deprecated_feature(msg)`
                                 and `write_debug(msg)`
        @param field_preference  Iterable of sort specifications given by the extractor
        """
        self.ydl = ydl
        self._order = []
        # The methods below write per-run state (reverse/closest/limit, cached defaults,
        # a downgraded 'convert') into `settings`. Copy the class-level table so that
        # this state cannot leak into, or be clobbered by, other FormatSorter instances.
        # Per-field dicts are copied; their values are only ever read, so they may be shared.
        self.settings = {name: dict(spec) for name, spec in self.settings.items()}
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return setting `key` of `field`, computing and caching a default if it is unset.

        Unknown fields yield False for 'forced'/'priority'; for any other key they are
        registered with empty settings after emitting a deprecation notice.
        """
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        prop_obj = self.settings[field]
        if key not in prop_obj:
            type_ = prop_obj.get('type')
            if key == 'field':
                default = 'preference' if type_ == 'extractor' else (field,) if type_ in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type_ == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            prop_obj[key] = default
        return prop_obj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert a raw value (a limit text or a format's value) according to the field's 'convert' setting.

        None is returned unchanged unless `convertNone` is set; other values are lowercased first.
        For 'order' conversion the result is a rank where entries earlier in the order list
        rank higher; values not in the list get the rank of the '' entry.
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    # Falsy entries ('' and None) are placeholders, not patterns
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # A non-numeric limit switches this field to string comparison for this sorter
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Parse all sort specifications and populate `self._order` and the per-field state.

        @param params          Options dict; reads 'prefer_free_formats', 'format_sort'
                               and 'format_sort_force'
        @param sort_extractor  Iterable of sort specifications given by the extractor
        @raises ExtractorError If a specification does not match `regex`
        """
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            field = field.lower()
            if field in self._order:
                # The first occurrence of a field wins
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            self.settings.setdefault(field, {}).update({
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit,
            })

        forced = tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
        if params.get('format_sort_force', False):
            priority = tuple()
        else:
            priority = tuple(field for field in self.default if self._get_field_setting(field, 'priority'))
        sort_list = forced + priority + tuple(self._sort_user) + tuple(sort_extractor) + self.default

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A 'combined' field expands to its sub-fields, each taking its own ':'-separated limit
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for i, sub_field in enumerate(fields):
                if i < limit_count:
                    sub_limit = limits[i]
                elif has_limit and not has_multiple_limits:
                    sub_limit = limits[0]
                else:
                    sub_limit = None
                add_item(sub_field, reverse, closest, sub_limit)

    def print_verbose_info(self, write_debug):
        """Report the user/extractor sort specifications and the resulting order via `write_debug`."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))

        def describe(field):
            prefix = '+' if self._get_field_setting(field, 'reverse') else ''
            limit_text = self._get_field_setting(field, 'limit_text')
            if limit_text is None:
                return f'{prefix}{field}'
            separator = '~' if self._get_field_setting(field, 'closest') else ':'
            limit = self._get_field_setting(field, 'limit')
            return f'{prefix}{field}{separator}{limit_text}({limit})'

        write_debug('Formats sorted by: %s' % ', '.join(
            describe(field) for field in self._order if self._get_field_setting(field, 'visible')))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn a field's value into a comparable tuple; larger tuples are preferred.

        `type` is the field type ('extractor', 'boolean', 'ordered' or 'field').
        """
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        if value is None:
            return (-10, 0)
        if not is_num:
            # if a field has mixed strings and numbers, strings are sorted higher
            return (1, value, 0)
        if closest:
            return (0, -abs(value - limit), value - limit if reverse else limit - value)
        if not reverse and (limit is None or value <= limit):
            return (0, value, 0)
        if limit is None or (reverse and value == limit) or value > limit:
            return (0, -value, 0)
        return (-1, value, 0)

    def _calculate_field_preference(self, format, field):
        """Read the value of `field` from the format dict and convert it to a sort tuple."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return the sort key of a format dict: one tuple per field in the evaluated order.

        NOTE: The format dict is modified in place; missing 'protocol', 'ext',
        'video_ext'/'audio_ext' and bitrate fields are filled in where they can be derived.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5406
5407
5408 # XXX: Temporary
5409 class _YDLLogger:
5410 def __init__(self, ydl=None):
5411 self._ydl = ydl
5412
5413 def debug(self, message):
5414 if self._ydl:
5415 self._ydl.write_debug(message)
5416
5417 def info(self, message):
5418 if self._ydl:
5419 self._ydl.to_screen(message)
5420
5421 def warning(self, message, *, once=False):
5422 if self._ydl:
5423 self._ydl.report_warning(message, once)
5424
5425 def error(self, message, *, is_error=True):
5426 if self._ydl:
5427 self._ydl.report_error(message, is_error=is_error)
5428
5429 def stdout(self, message):
5430 if self._ydl:
5431 self._ydl.to_stdout(message)
5432
5433 def stderr(self, message):
5434 if self._ydl:
5435 self._ydl.to_stderr(message)