# yt_dlp/utils/_utils.py
1 import base64
2 import binascii
3 import calendar
4 import codecs
5 import collections
6 import collections.abc
7 import contextlib
8 import datetime
9 import email.header
10 import email.utils
11 import errno
12 import hashlib
13 import hmac
14 import html.entities
15 import html.parser
16 import inspect
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import mimetypes
23 import netrc
24 import operator
25 import os
26 import platform
27 import random
28 import re
29 import shlex
30 import socket
31 import ssl
32 import struct
33 import subprocess
34 import sys
35 import tempfile
36 import time
37 import traceback
38 import types
39 import unicodedata
40 import urllib.error
41 import urllib.parse
42 import urllib.request
43 import xml.etree.ElementTree
44
45 from . import traversal
46
47 from ..compat import functools # isort: split
48 from ..compat import (
49 compat_etree_fromstring,
50 compat_expanduser,
51 compat_HTMLParseError,
52 compat_os_name,
53 compat_shlex_quote,
54 )
55 from ..dependencies import xattr
56
# Report this submodule's contents as belonging to the parent `yt_dlp.utils`
# package (affects repr(), pickling and error messages)
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
61
62
class NO_DEFAULT:
    """Sentinel used to distinguish "no default supplied" from `None`."""
    pass
65
66
def IDENTITY(x):
    """Identity function: return the argument unchanged."""
    return x
69
70
# English month names, the canonical list for date parsing
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code, for parsing localised date strings
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# Timezone abbreviation -> UTC offset in hours
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}
96
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its (possibly multi-character) ASCII fallback
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
101
# strptime() patterns tried (in order) when parsing free-form date strings
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Additional patterns where the day precedes the month (European style)
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# Additional patterns where the month precedes the day (US style)
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of the common JS "P.A.C.K.E.R." obfuscator
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches <script type="application/ld+json"> blocks (JSON-LD metadata)
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# An integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
172
173
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable before trusting it
        'TEST'.encode(encoding)
    except Exception:
        encoding = 'UTF-8'
    return encoding
188
189
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # Write to a sibling temporary file first, so the final rename is atomic
    # on POSIX filesystems
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile creates files with restrictive permissions;
            # re-apply the process umask so the result matches a regular open()
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file; re-raise the original error
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
214
215
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    # Only plain attribute names are supported (no quoting/escaping done)
    assert re.match(r'^[a-zA-Z_-]+$', key)
    predicate = '[@%s]' % key if val is None else f"[@{key}='{val}']"
    return node.find(xpath + predicate)
221
# Historical note: xpath_with_ns exists because on very old Python versions
# the xml.etree.ElementTree.Element methods did not support the namespaces
# parameter
224
225
def xpath_with_ns(path, ns_map):
    """Expand `prefix:tag` components of an xpath using ns_map ({prefix: uri})."""
    expanded = []
    for component in path.split('/'):
        pieces = component.split(':')
        if len(pieces) == 1:
            expanded.append(pieces[0])
        else:
            prefix, tag = pieces
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
236
237
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a string, or an iterable of
    candidate xpaths tried in order). Falls back to `default` if given,
    raises when `fatal`, otherwise returns None."""
    def _find(xp):
        return node.find(xp)

    if isinstance(xpath, str):
        n = _find(xpath)
    else:
        for candidate in xpath:
            n = _find(candidate)
            if n is not None:
                break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element %s' % (xpath if name is None else name))
    return None
259
260
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but return the matched element's text content."""
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element
    if element.text is not None:
        return element.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
274
275
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Find xpath[@key] and return the value of attribute `key`."""
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (f'{xpath}[@{key}]' if name is None else name))
    return None
287
288
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document (None if not found)"""
    return get_element_by_attribute('id', id, html, **kwargs)
292
293
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document (None if not found)"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
297
298
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
303
304
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_html_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
309
310
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute=value, or None"""
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None
314
315
def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the html of the first tag with the specified attribute=value, or None"""
    matches = get_elements_html_by_attribute(attribute, value, html, **kargs)
    if not matches:
        return None
    return matches[0]
319
320
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # NB: **kargs is accepted (and ignored) for interface compatibility
    # Match class_name as a whole word inside the attribute's value list
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
326
327
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # Match class_name as a whole word inside the attribute's value list
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
333
334
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document, as a list"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
338
339
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document, as a list"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
343
344
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    Yields (content, whole_element) pairs for every matching tag.
    """
    if not value:
        return

    # If the value begins with a character that is illegal in an unquoted
    # attribute value, only quoted occurrences can match; otherwise the
    # quote character is made optional ('?')
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches the opening tag up to and including the sought attribute;
    # the full element is then extracted by get_element_text_and_html_by_tag
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of surrounding quotes, if any, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
370
371
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Control-flow exception: signals "found the matching closing tag"
        pass

    def __init__(self):
        # Stack of currently-open tags; emptied when the first tag closes
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop (possibly unclosed) inner tags until the matching opener is found
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The first opened tag has just been closed
            raise self.HTMLBreakOnClosingTagException()
412
413
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index that raises the given parse error instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed only the opening tag first, to confirm the parser agrees on it
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Advance one candidate closing tag at a time; the parser signals via
        # HTMLBreakOnClosingTagException when the nesting actually balances
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
448
449
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        super().__init__()

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        # Abort parsing: only the first start tag is of interest
        raise compat_HTMLParseError('done')
460
461
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        # Record only top-level <li> elements; nested lists are skipped
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
477
478
def extract_attributes(html_element):
    """Decode the attributes of an HTML start tag given as a string, e.g.
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    becomes
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    except compat_HTMLParseError:
        # The parser deliberately aborts after the first start tag
        pass
    return parser.attrs
498
499
def parse_list(webpage):
    """Given a string for an series of HTML <li> elements,
    return a dictionary of their attributes"""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
507
508
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Applied strictly in order: collapse whitespace, turn <br>/paragraph
    # boundaries into newlines, strip remaining tags
    substitutions = (
        (r'\s+', ' '),
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
523
524
class LenientJSONDecoder(json.JSONDecoder):
    """JSON decoder that can pre-transform the input, ignore trailing data
    and attempt to close truncated objects/arrays."""
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        # transform_source: optional callable applied to the string before decoding
        # ignore_extra: if True, decode the first JSON value and ignore the rest
        # close_objects: how many unterminated objects/arrays to try to close
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # Each close may take two repair passes (append ',' first, then the
        # closing bracket), hence 2 * close_objects attempts
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Try to repair the truncated document from the decode error;
        return the amended string, or None if not repairable here."""
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                # Re-raise with surrounding context in the message for debugging
                raise type(e)(f'{e.msg} in {s[e.pos - 10:e.pos + 10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
563
564
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed or is unsupported: fall back to a plain open()
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Permission errors are never fixable by renaming; give up at once
            if attempt or err.errno in (errno.EACCES,):
                raise
            # First failure: retry once with a sanitized path
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
602
603
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
611
612
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    # NUL ('\0') is used below as a temporary marker for substitute
    # characters; it is collapsed/stripped at the end
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # '?', control characters and DEL are dropped entirely
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '' if unicodedata.category(char)[0] in 'CM' else '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
666
667
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows.

    On other platforms the path is returned unchanged unless force=True.
    """
    # XXX: this handles drive relative paths (c:sth) incorrectly
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters that are illegal in Windows path components, and
    # trailing spaces/dots, with '#' ('.'/'..' components are kept as-is)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    # TODO: Fix behavioral differences <3.12
    # The workaround using `normpath` only superficially passes tests
    # Ref: https://github.com/python/cpython/pull/100351
    return os.path.normpath(os.path.join(*sanitized_path))
693
694
def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    if url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    common_typos = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in common_typos:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
713
714
def extract_basic_auth(url):
    """Strip userinfo from url; return (clean_url, 'Basic ...' header or None)."""
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild netloc without the credentials
    netloc = parts.hostname if parts.port is None else '%s:%d' % (parts.hostname, parts.port)
    url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = '%s:%s' % (parts.username, parts.password or '')
    auth_payload = base64.b64encode(credentials.encode())
    return url, f'Basic {auth_payload.decode()}'
725
726
def expand_path(s):
    """Expand shell variables and ~ (user home) in s"""
    return os.path.expandvars(compat_expanduser(s))
730
731
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _dedupe():
        # A list (not a set) is used for membership so unhashable items work
        encountered = []
        for item in iterable:
            if item not in encountered:
                encountered.append(item)
                yield item

    gen = _dedupe()
    return gen if lazy else list(gen)
742
743
744 def _htmlentity_transform(entity_with_semicolon):
745 """Transforms an HTML entity to a character."""
746 entity = entity_with_semicolon[:-1]
747
748 # Known non-numeric HTML entity
749 if entity in html.entities.name2codepoint:
750 return chr(html.entities.name2codepoint[entity])
751
752 # TODO: HTML5 allows entities without a semicolon.
753 # E.g. '&Eacuteric' should be decoded as 'Éric'.
754 if entity_with_semicolon in html.entities.html5:
755 return html.entities.html5[entity_with_semicolon]
756
757 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
758 if mobj is not None:
759 numstr = mobj.group(1)
760 if numstr.startswith('x'):
761 base = 16
762 numstr = '0%s' % numstr
763 else:
764 base = 10
765 # See https://github.com/ytdl-org/youtube-dl/issues/7518
766 with contextlib.suppress(ValueError):
767 return chr(int(numstr, base))
768
769 # Unknown entity in name, return its literal representation
770 return '&%s;' % entity
771
772
def unescapeHTML(s):
    """Replace HTML entities in s with their characters; None passes through."""
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda match: _htmlentity_transform(match.group(1)), s)
780
781
def escapeHTML(text):
    """Escape &, <, >, " and ' for safe embedding into HTML."""
    # '&' must be replaced first so already-inserted entities are not mangled
    replacements = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    for char, entity in replacements:
        text = text.replace(char, entity)
    return text
791
792
class netrc_from_content(netrc.netrc):
    """netrc parser that reads credentials from a string instead of a file."""

    def __init__(self, content):
        self.hosts, self.macros = {}, {}
        with io.StringIO(content) as stream:
            # '-' serves as the placeholder filename in parse error messages
            self._parse('-', stream, False)
798
799
class Popen(subprocess.Popen):
    """subprocess.Popen wrapper: hides the console window on Windows, restores
    library paths under PyInstaller, defaults text mode to UTF-8/replace, and
    adds kill/communicate/run conveniences."""
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        # Prevents a visible console window from popping up for subprocesses
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
           Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether streams are text, for the default values in run()
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')

        if shell and compat_os_name == 'nt' and kwargs.get('executable') is None:
            # Spawn cmd.exe explicitly with quoting-safe flags instead of
            # relying on shell=True
            if not isinstance(args, str):
                args = ' '.join(compat_shlex_quote(a) for a in args)
            shell = False
            args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"'

        super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo)

    def __comspec(self):
        # Locate cmd.exe via %ComSpec%, falling back to %SystemRoot%\System32
        comspec = os.environ.get('ComSpec') or os.path.join(
            os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe')
        if os.path.isabs(comspec):
            return comspec
        raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set')

    def communicate_or_kill(self, *args, **kwargs):
        """communicate(), but kill the process if communication fails/interrupts."""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            # Optionally reap the process so no zombie is left behind
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run to completion; return (stdout, stderr, returncode)."""
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
870
871
def encodeArgument(s):
    """Return s as str, decoding legacy ASCII byte strings."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
877
878
# Lightweight (hours, minutes, seconds, milliseconds) record used by the
# duration helpers below
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
880
881
def timetuple_from_msec(msec):
    """Split a millisecond count into an (hours, minutes, seconds, milliseconds) tuple."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
887
888
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H<delim>M]M<delim>SS, optionally with .mmm."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        formatted = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        formatted = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        formatted = '%d' % t.seconds
    if msec:
        formatted = '%s.%03d' % (formatted, t.milliseconds)
    return formatted
898
899
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" blurb, appended after `before`."""
    from ..update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        # Starting a new sentence: capitalise the first word
        msg = msg[0].title() + msg[1:]

    if not before:
        return msg
    return f'{before} {msg}'
911
912
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None  # subclasses may set a class-level default message

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            # No explicit message and no class default: fall back to class name
            self.msg = type(self).__name__
        super().__init__(self.msg)
923
924
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions
        # Network failures are never treated as yt-dlp bugs
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie  # used as the '[...] ' prefix of the full message
        self.exc_info = sys.exc_info()  # preserve original exception
        # Collapse chained ExtractorErrors down to the innermost original exc_info
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: "[ie] video_id: orig_msg (caused by ...)" plus the
        # bug-report blurb for unexpected errors
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted traceback (including the cause chain), or None."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Keep msg/args in sync when attributes feeding __msg change after init
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
967
968
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor supports."""

    def __init__(self, url):
        super().__init__('Unsupported URL: %s' % url, expected=True)
        self.url = url
974
975
class RegexNotFoundError(ExtractorError):
    """Error raised when a regular expression fails to match"""
    pass
979
980
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # countries: presumably a list of country codes where the content is
        # available — not validated here; TODO confirm against callers
        kwargs['expected'] = True  # geo-restriction is not a bug in yt-dlp
        super().__init__(msg, **kwargs)
        self.countries = countries
992
993
class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True  # not being live is an expected condition
        super().__init__(msg or 'The channel is not currently live', **kwargs)
1000
1001
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info  # (type, value, traceback) of the underlying error
1014
1015
class EntryNotInPlaylist(YoutubeDLError):
    """Raised by YoutubeDL when a requested entry cannot be found
    in the playlist's info_dict"""
    msg = 'Entry not found in info'
1023
1024
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Include the offending filename in the message (previously a
            # hard-coded '(unknown)' placeholder that discarded it)
            self.msg += f': {filename}'
        super().__init__(self.msg)
1037
1038
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    Raised from a PostProcessor's .run() method to signal a failure
    in the postprocessing task.
    """
1045
1046
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted"""
    msg = 'The download was cancelled'
1050
1051
class ExistingVideoReached(DownloadCancelled):
    """Raised when --break-on-existing is triggered"""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1055
1056
class RejectedVideoReached(DownloadCancelled):
    """Raised when --break-match-filter is triggered"""
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1060
1061
class MaxDownloadsReached(DownloadCancelled):
    """Raised once the --max-downloads limit has been reached"""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1065
1066
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """expected: whether this re-extraction is an anticipated, non-bug event"""
        super().__init__(msg)
        self.expected = expected
1073
1074
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Throttling-triggered re-extraction is never marked as "expected"
        super().__init__(self.msg, expected=False)
1081
1082
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Raised when a video is requested in a format that is not
    available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1095
1096
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller
    than what the server announced, indicating the connection was
    probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1110
1111
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended attributes fails; self.reason holds a coarse cause."""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code, self.msg = code, msg

        # Derive a portable reason string from errno and/or message text
        no_space = self.code in (errno.ENOSPC, errno.EDQUOT)
        no_space = no_space or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg
        if no_space:
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1126
1127
class XAttrUnavailableError(YoutubeDLError):
    """Raised when xattr support is unavailable (presumably missing tooling/OS support — semantics defined by callers)"""
1130
1131
def is_path_like(f):
    """Whether *f* can be used as a filesystem path (str, bytes or os.PathLike)."""
    return isinstance(f, (str, bytes, os.PathLike))
1134
1135
def extract_timezone(date_str):
    """Split *date_str* into (utc-offset timedelta, remaining date string).

    Handles 'Z', numeric offsets like '+01:00'/'-0500', and the named
    timezones listed in TIMEZONE_NAMES; falls back to timedelta(0).
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # No numeric offset: try a trailing named timezone after a time-of-day
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        # `m and ...` keeps this None-safe when the second search also fails
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # Matched the bare 'Z' (UTC)
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1164
1165
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # strptime's %S cannot handle fractional seconds; drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
        return calendar.timegm(dt.timetuple())
1181
1182
def date_formats(day_first=True):
    """Return the supported date format strings, day-first or month-first."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1185
1186
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    # The UTC offset is irrelevant for a date-only result; just strip it
    _, date_str = extract_timezone(date_str)

    # Try every known format; a later successful match overwrites an earlier one
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822 style parsing (e.g. 'Thu  01 Jan 1970 00:00:00')
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)
1209
1210
def unified_timestamp(date_str, day_first=True):
    """Parse assorted date/time strings into a UNIX timestamp, or None."""
    if not isinstance(date_str, str):
        return None

    # Drop commas/pipes and (abbreviated) weekday names, collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # Remember a PM marker before it is stripped below; applied as +12h
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Last resort: RFC 2822 parsing via the email package
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1242
1243
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1255
1256
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle filename: replaces the extension with '<lang>.<format>'."""
    return replace_extension(filename, f'{sub_lang}.{sub_format}', expected_real_ext)
1259
1260
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # Recursively resolve the base date, then apply the signed offset
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # timedelta cannot express months/years; use calendar-aware helper
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # 'auto': round to the unit the offset was expressed in
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1301
1302
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1313
1314
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months, clamping the day
    to the last day of the target month when needed."""
    total_months = dt.month - 1 + months
    year = dt.year + total_months // 12
    month = total_months % 12 + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1322
1323
def datetime_round(dt, precision='day'):
    """Round a datetime to the nearest *precision* unit (UTC result)."""
    if precision == 'microsecond':
        return dt  # nothing to round

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    stamp = calendar.timegm(dt.timetuple())
    rounded = ((stamp + seconds_per_unit / 2) // seconds_per_unit) * seconds_per_unit
    return datetime.datetime.fromtimestamp(rounded, datetime.timezone.utc)
1340
1341
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
1350
1351
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = (date_from_str(start, strict=True) if start is not None
                      else datetime.datetime.min.date())
        self.end = (date_from_str(end, strict=True) if end is not None
                    else datetime.datetime.max.date())
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1385
1386
@functools.cache
def system_identifier():
    """One-line description of the Python runtime and platform (cached)."""
    impl = platform.python_implementation()
    if impl == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        impl += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        impl,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1405
1406
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1414
1415
def write_string(s, out=None, encoding=None):
    """Write *s* to *out* (default: sys.stderr), coping with byte streams
    and text streams with an underlying buffer."""
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: must encode ourselves
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream with a raw buffer: bypass its encoder
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1435
1436
1437 # TODO: Use global logger
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report a deprecation: printed (deduplicated) in CLI mode, emitted as a
    DeprecationWarning otherwise. kwargs are passed through to the printer."""
    from .. import _IN_CLI
    if _IN_CLI:
        # CLI mode: each distinct message is shown at most once per run
        if msg in deprecation_warning._cache:
            return
        deprecation_warning._cache.add(msg)
        if printer:
            return printer(f'{msg}{bug_reports_message()}', **kwargs)
        return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
    else:
        import warnings
        # Offset stacklevel so the warning points at the deprecated call site
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)


# Messages already emitted in CLI mode
deprecation_warning._cache = set()
1453
1454
def bytes_to_intlist(bs):
    """Convert a bytes-like object (or str) into a list of integer values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # bytes/bytearray indexing already yields ints
        return list(bs)
    return [ord(c) for c in bs]
1462
1463
def intlist_to_bytes(xs):
    """Pack a sequence of ints (0-255) back into a bytes object."""
    if not xs:
        return b''
    return struct.pack('%dB' % len(xs), *xs)
1468
1469
class LockingUnsupportedError(OSError):
    """Raised when no file-locking primitive is available on this platform."""
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1475
1476
# Cross-platform file locking: defines _lock_file(f, exclusive, block) and
# _unlock_file(f) using LockFileEx/UnlockFileEx on Windows and
# fcntl.flock/lockf elsewhere; stubs raise LockingUnsupportedError.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Win32 OVERLAPPED struct, required by LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the largest representable byte range
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stash the pointer on the file object; _unlock_file needs the same one
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        # Requires the OVERLAPPED pointer stored by _lock_file
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # Non-blocking attempt failed; propagate as-is
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            # Try each unlock mechanism in turn, mirroring _lock_file's fallbacks
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
1563
1564
class locked_file:
    """File wrapper that holds a lock (via _lock_file) for its open lifetime.

    'w' modes deliberately open without O_TRUNC and truncate only AFTER the
    lock has been acquired, so a concurrent reader never sees a half state.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        # Read-only modes take a shared lock, everything else an exclusive one
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Deferred truncation (see class docstring)
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Allow use without the `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate everything else (read/write/seek/...) to the real file
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1628
1629
@functools.cache
def get_filesystem_encoding():
    """The filesystem encoding, defaulting to utf-8 when undetectable (cached)."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
1634
1635
def shell_quote(args):
    """Return a single shell-escaped command line built from *args*."""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(a.decode(encoding) if isinstance(a, bytes) else a)
        for a in args)
1645
1646
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge any data already smuggled into the URL
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    fragment = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{fragment}'
1655
1656
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(); returns (url, data) or (url, default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1664
1665
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format numbers with decimal suffixes like K, M, etc. (Ki/Mi when factor=1024)."""
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        # Binary prefixes: 'k' -> 'Ki', 'M' -> 'Mi', ...; no suffix stays empty
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    return fmt % (num / factor ** exponent, suffix)
1678
1679
def format_bytes(bytes):
    """Human-readable byte count using binary suffixes, or 'N/A'."""
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
1682
1683
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number> <unit>' using *unit_table*; returns a rounded int or None.

    Non-strict mode accepts ',' as a decimal separator and only anchors at
    the start of the string.
    """
    num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(map(re.escape, unit_table))
    matcher = re.fullmatch if strict else re.match
    m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None
    value = float(m.group('num').replace(',', '.'))
    return round(value * unit_table[m.group('unit')])
1695
1696
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    unit_table = {unit: 1024**exponent for exponent, unit in enumerate(['', *'KMGTPEZY'])}
    return lookup_unit_table(unit_table, s.upper(), strict=True)
1702
1703
def parse_filesize(s):
    """Parse a human file size like '5.5MiB' into a byte count (int), or None.

    Note the deliberate asymmetries in the table below (e.g. 'kB' -> 1024
    but 'KB' -> 1000): do not "fix" them without checking the extractors.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1773
1774
def parse_count(s):
    """Parse a human count like '1.2M views' into an int; None if unparseable."""
    if s is None:
        return None

    # Drop a non-numeric prefix such as 'Views '
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    result = lookup_unit_table({
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }, s)
    if result is not None:
        return result

    # Fall back to a leading bare number ('123 views')
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(mobj.group(1)) if mobj else None
1802
1803
def parse_resolution(s, *, lenient=False):
    """Extract {'width', 'height'} (or just 'height') from a resolution-like string."""
    if s is None:
        return {}

    # lenient mode drops the alnum boundary guards around WxH
    pattern = (r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
               else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    mobj = re.search(pattern, s)
    if mobj:
        return {'width': int(mobj.group('w')), 'height': int(mobj.group('h'))}

    # '1080p' / '480i' style
    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    # '4k' / '8K' style
    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}
1827
1828
def parse_bitrate(s):
    """Extract an integer kbps value from a string like '128 kbps', else None."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
1835
1836
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    if lang not in MONTH_NAMES:
        lang = 'en'
    try:
        return MONTH_NAMES[lang].index(name) + 1
    except ValueError:
        return None
1846
1847
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1856
1857
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Existing entities (&amp; &lt; ... and numeric char refs) are left alone
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
1864
1865
def setproctitle(title):
    """Best-effort: set the process name via glibc prctl(PR_SET_NAME);
    silently does nothing when ctypes/libc.so.6/prctl are unavailable."""
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # PR_SET_NAME = 15 Ref: /usr/include/linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1892
1893
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present; None-safe."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1896
1897
def remove_end(s, end):
    """Strip *end* from the end of *s* when present; None-safe."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
1900
1901
def remove_quotes(s):
    """Strip one matching pair of surrounding quotes (single or double)."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1909
1910
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1917
1918
def url_basename(url):
    """Last component of the URL's path ('' when the path is empty)."""
    path = urllib.parse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
1922
1923
def base_url(url):
    """The URL up to (and including) the last '/' before any query/fragment."""
    m = re.match(r'https?://[^?#]+/', url)
    return m.group()
1926
1927
def urljoin(base, path):
    """Join *base* and *path* like urllib.parse.urljoin, but with permissive
    bytes/None handling; returns None on unusable input."""
    if isinstance(path, bytes):
        path = path.decode()
    if not path or not isinstance(path, str):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path  # already absolute (or scheme-relative)
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
1941
1942
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """int(v) * invscale // scale, or *default* when conversion fails.
    With get_attr, the named attribute of *v* is converted instead."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        # The whole expression stays inside the try: odd scale/invscale
        # values must also fall back to the default
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
1950
1951
def str_or_none(v, default=None):
    """str(v), or *default* when v is None."""
    if v is None:
        return default
    return str(v)
1954
1955
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Strip thousands separators ('.'/',') and a leading '+'
        return int_or_none(re.sub(r'[,\.\+]', '', int_str))
    return None
1963
1964
def float_or_none(v, scale=1, invscale=1, default=None):
    """float(v) * invscale / scale, or *default* when v is None or unconvertible."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
1972
1973
def bool_or_none(v, default=None):
    """v when it is an actual bool, else *default* (truthy values do NOT count)."""
    if isinstance(v, bool):
        return v
    return default
1976
1977
def strip_or_none(v, default=None):
    """v.strip() for strings, else *default*."""
    if isinstance(v, str):
        return v.strip()
    return default
1980
1981
def url_or_none(url):
    """The stripped URL if it looks like a supported scheme (http(s), rtmp
    variants, rtsp, mms, ftp(s)) or is scheme-relative; otherwise None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
1987
1988
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a unix timestamp (int/float) or 'YYYYMMDD' string using
    *date_format*; returns *default* on any failure."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            dt_obj = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                      + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            dt_obj = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            dt_obj = None  # triggers AttributeError below -> default
        # Substitute %s (epoch seconds) manually, since Windows strftime lacks it
        date_format = re.sub(
            r'(?<!%)(%%)*%s', rf'\g<1>{int(dt_obj.timestamp())}', date_format)
        return dt_obj.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2006
2007
def parse_duration(s):
    """Parse a duration string ('1:23:45', '2h 3m', ISO-8601-like 'PT1H2M3S',
    '2.5 hours', ...) into seconds as a float, or None when unparseable."""
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # 1) Colon-separated clock format: [[dd:]hh:]mm:ss[.ms]
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2) Unit-suffixed format, also covering ISO 8601 durations; note that
        # years/months/weeks before 'T' are matched but intentionally ignored
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:(?:ou)?rs?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3) Fractional '2.5 hours' / '90 min' forms
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        # ':' is accepted as a fraction separator in format 1); normalize it
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2062
2063
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the file's real extension.

    prepend_extension('abc.ext', 'temp') -> 'abc.temp.ext'

    If `expected_real_ext` is given and the actual extension differs, `ext` is
    appended to the unchanged filename instead:
    prepend_extension('abc.unexpected_ext', 'temp', 'ext') -> 'abc.unexpected_ext.temp'
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return f'{name}.{ext}{real_ext}'
    # Fix: previously returned the literal '(unknown)' instead of the original
    # filename here, which destroyed the output path on extension mismatch
    return f'{filename}.{ext}'
2070
2071
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's extension with `ext`; if `expected_real_ext` is given
    and does not match the actual one, append `ext` to the whole filename instead."""
    name, real_ext = os.path.splitext(filename)
    base = name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename
    return f'{base}.{ext}'
2077
2078
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        Popen.run([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return exe
    except OSError:
        # Not found in PATH (or not executable)
        return False
2087
2088
def _get_exe_version_output(exe, args):
    """Run `exe` with `args`; return its combined output, None on a non-zero
    exit status, or False if the executable could not be started."""
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, returncode = Popen.run(
            [encodeArgument(exe), *args], text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else stdout
2101
2102
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's output using `version_re`
    (default: 'version <ver>'); return `unrecognized` if nothing matches."""
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    mobj = re.search(version_re, output)
    return mobj.group(1) if mobj else unrecognized
2112
2113
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        # The executable exists but exited with an error
        return unrecognized[-1]
    return output and detect_exe_version(output, version_re, unrecognized[0])
2124
2125
def frange(start=0, stop=None, step=1):
    """Float range"""
    if stop is None:
        start, stop = 0, start
    if not step:
        # Zero step yields nothing (mirrors sign == 0 short-circuit)
        return
    direction = 1 if step > 0 else -1
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2134
2135
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Subclass so callers can distinguish LazyList misses from other IndexErrors
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache may be shared with another view of the same data
        # (see __reversed__ and __copy__)
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Serve already-consumed items first, then keep caching as we go
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # ~x == -x - 1, i.e. the same position counted from the opposite end
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                # Map the requested slice onto the underlying (forward) cache
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Only consume as many items as the requested index/slice needs
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            # Checking the first logical element consumes at most one item
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # Both views share the iterable and cache; only the direction flag differs
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2223
2224
class PagedList:
    """Base class for sequences of entries fetched lazily, one page at a time."""

    class IndexError(IndexError):
        # Subclass so callers can distinguish paging misses from other IndexErrors
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return the entries of page `pagenum`, consulting the cache first."""
        cached = self._cache.get(pagenum)
        if cached is not None:
            return cached
        # Pages beyond the known page count are empty by definition
        results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # Random access would refetch pages, so it is only allowed with caching
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __bool__(self):
        return bool(self.getslice(0, 1))
2266
2267
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # Iterate pages starting from the one that contains `start`
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset just past the last wanted entry, or None for "rest of page"
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember where fetching failed so later lookups short-circuit
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2307
2308
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        # One past the last page needed, clamped to the known page count
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # Entries to drop from the first page so the slice starts at `start`
        skip_elems = start - start_page * self._pagesize
        # How many entries are still wanted in total (None = all remaining)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None  # only applies to the first page
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    # This page contains the final wanted entry
                    yield from page_results[:only_more]
                    break
            yield from page_results
2333
2334
class PlaylistEntries:
    """Resolves a playlist info_dict's entries, honoring the user's
    --playlist-items / --playliststart / --playlistend selection."""

    # Sentinel for a requested entry that is absent from the playlist
    MissingEntry = object()
    # Whether the full extent of the playlist is already known
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Re-seat the known entries at their original 1-based playlist
            # indices, leaving MissingEntry in the gaps
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches item specs like "3", "5:10", "-4:", "1:10:2", "::-1"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or slice for each comma-separated item spec in `string`."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (playlist_index, entry) pairs for all user-requested items."""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Total number of playlist entries, if determinable without extraction."""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function mapping a 0-based index to an entry, raising
        # self.IndexError past the end of the playlist
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Yields (1-based index, entry) pairs; `idx` uses 1-based positions too
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2469
2470
def uppercase_escape(s):
    """Expand \\UXXXXXXXX escape sequences in `s` into the corresponding characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
2477
2478
def lowercase_escape(s):
    """Expand \\uXXXX escape sequences in `s` into the corresponding characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
2485
2486
def parse_qs(url, **kwargs):
    """Parse the query string of `url` into a dict of value lists."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2489
2490
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file, skipping comments and blank lines.

    @param batch_fd  Iterable of lines (str or bytes); closed when done
    @returns         List of cleaned-up URLs
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit must be passed as keyword - positional use is
        # deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2508
2509
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return ASCII bytes suitable for a POST body."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2512
2513
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url str or parse url tuple
    @param query_update update query
    @returns str
    """
    if isinstance(url, str):
        # Fast path: nothing to change
        if not kwargs and not query_update:
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged = {
            **urllib.parse.parse_qs(url.query),
            **query_update
        }
        kwargs['query'] = urllib.parse.urlencode(merged, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2532
2533
def update_url_query(url, query):
    """Add or replace the given query parameters in `url`."""
    return update_url(url, query_update=query)
2536
2537
2538 def _multipart_encode_impl(data, boundary):
2539 content_type = 'multipart/form-data; boundary=%s' % boundary
2540
2541 out = b''
2542 for k, v in data.items():
2543 out += b'--' + boundary.encode('ascii') + b'\r\n'
2544 if isinstance(k, str):
2545 k = k.encode()
2546 if isinstance(v, str):
2547 v = v.encode()
2548 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2549 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2550 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2551 if boundary.encode('ascii') in content:
2552 raise ValueError('Boundary overlaps with data')
2553 out += content
2554
2555 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2556
2557 return out, content_type
2558
2559
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            if has_specified_boundary:
                raise
            # Random boundary collided with the payload: retry with a fresh one
            boundary = None
2588
2589
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether `x` is an instance of `allowed_types` without being one of the
    blocked types (by default: str, bytes and mappings)."""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    return not isinstance(x, blocked_types) and isinstance(x, allowed_types)
2594
2595
def variadic(x, allowed_types=NO_DEFAULT):
    """Return `x` unchanged if it is iterable-like, else wrap it in a 1-tuple."""
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
2601
2602
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn with `args`/`kwargs` and return the first
    result that neither raises a common error nor fails the `expected_type`
    check; returns None if every candidate fails."""
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
2612
2613
def try_get(src, getter, expected_type=None):
    """Apply one or more getter callables to `src`, returning the first result
    that succeeds (and matches `expected_type`, if given)."""
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2616
2617
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of `dct` containing only items satisfying `cndn(key, value)`
    (by default: those whose value is not None)."""
    return {key: value for key, value in dct.items() if cndn(key, value)}
2620
2621
def merge_dicts(*dicts):
    """Merge the given dicts with earlier values taking precedence.

    A key is assigned when it is not yet present and the new value is not None,
    or when a later string value replaces an existing empty string."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            take_new = (value is not None and key not in merged
                        or isinstance(value, str) and merged[key] == '')
            if take_new:
                merged[key] = value
    return merged
2630
2631
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` unchanged if it is already str, otherwise decode it
    using `encoding` and `errors`."""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
2634
2635
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """Parse an age limit from an int, an 'NN+' string, a US movie rating or a
    TV parental guideline; returns None when unrecognized."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    mobj = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if mobj:
        return TV_PARENTAL_GUIDELINES['TV-' + mobj.group(1)]
    return None
2671
2672
def strip_jsonp(code):
    """Strip a JSONP callback wrapper (e.g. 'cb({...});'), leaving the JSON payload."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
2681
2682
def js_to_json(code, vars={}, *, strict=False):
    """Convert JavaScript-ish source `code` into a JSON-parseable string.

    @param vars    dict of variable name -> replacement value to substitute
    @param strict  if True, raise ValueError on unknown tokens instead of
                   quoting them as strings
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Keep escapes that are also valid JSON; rewrite or drop the rest
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Evaluate a `${...}` interpolation inside a template literal
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Normalize one matched token (string, keyword, number, comment, ...)
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            # Template literals get `${...}` substitution before re-quoting
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Integers used as object keys must become JSON strings
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # Convert `new Map([[k, v], ...])` into a JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2762
2763
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the preference list; unknown values rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2772
2773
# Stages at which postprocessors can be scheduled to run
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output filename templates
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types; the string values appear to be default filename
# suffix stems for the corresponding files (None = no separate default)
# — NOTE(review): verify against the callers of OUTTMPL_TYPES
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Regex template for %-style format specifiers; '{0}' is the key pattern and
# '{1}' the conversion-type pattern, filled in via str.format by users
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])? # unused in python
        {1} # conversion type
    )
    '''


# Conversion type characters accepted in %-style formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2813
2814
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
2823
2824
def version_tuple(v):
    """Convert a dotted/dashed version string into a tuple of ints."""
    return tuple(int(piece) for piece in re.split(r'[-.]', v))


def is_outdated_version(version, limit, assume_new=True):
    """Whether `version` is older than `limit`; missing or unparsable versions
    are treated as new or old according to `assume_new`."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
2836
2837
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Deferred import; presumably avoids a circular dependency with ..update
    from ..update import is_non_updateable

    return not is_non_updateable()
2844
2845
def args_to_str(args):
    """Return a short shell-quoted string representation of a subprocess command."""
    quoted = (compat_shlex_quote(part) for part in args)
    return ' '.join(quoted)
2849
2850
def error_to_str(err):
    """Format an exception as 'TypeName: message'."""
    return '{}: {}'.format(type(err).__name__, err)
2853
2854
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (optionally with parameters) to a file extension.

    For non-string input, returns `default` if given, else None. For unknown
    types, falls back to `default` or a cleaned-up version of the subtype."""
    if not isinstance(mt, str):
        if default is not NO_DEFAULT:
            return default
        return None

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'video/ogg': 'ogv',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop parameters (e.g. '; charset=utf-8') and normalize case
    mimetype = mt.partition(';')[0].strip().lower()
    _, _, subtype = mimetype.rpartition('/')

    # Look up the full type, then the subtype, then the subtype without '+suffix'
    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    elif default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
2944
2945
def ext2mimetype(ext_or_url):
    """Guess the MIME type from a file extension or URL; None if unknown or empty."""
    if not ext_or_url:
        return None
    if '.' not in ext_or_url:
        # Bare extension: turn it into a dummy filename for guess_type
        ext_or_url = f'file.{ext_or_url}'
    mime, _ = mimetypes.guess_type(ext_or_url)
    return mime
2952
2953
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string into vcodec/acodec/scodec and HDR info."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Strip leading zeros in each numeric part (e.g. 'av01.0.05M.08')
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                # Only the first video codec is kept
                continue
            vcodec = full_codec
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Two unrecognized codecs: assume video + audio, in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
2994
2995
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Pick a container extension compatible with the given codec and
    extension lists, optionally restricted/ordered by `preferences`."""
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        # Multiple video or audio streams: mkv is the known safe choice
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Reduce e.g. 'avc1.4D0041' to its base identifier 'avc1' (zeros dropped)
    sanitize_codec = functools.partial(
        try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    # First try to find a container whose codec set covers both codecs
    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    # Fall back to matching by the input extensions themselves
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3035
3036
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Guess a file extension for a response: from the Content-Disposition
    filename, then the x-amz-meta-name header, then the Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    meta_ext = getheader('x-amz-meta-name')
    if meta_ext:
        ext = meta_ext.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(getheader('Content-Type'), default=default)
3055
3056
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI for the given bytes and MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3059
3060
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit set, or content available for everyone
        return False
    return age_limit < content_limit
3069
3070
# List of known byte-order-marks (BOM)
# NB: the 4-byte UTF-32 BOMs must be tested before their 2-byte UTF-16 prefixes
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip (possibly repeated) BOMs, adopting the encoding they imply
        while first_bytes.startswith(bom):
            encoding, first_bytes = bom_encoding, first_bytes[len(bom):]

    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3090
3091
def determine_protocol(info_dict):
    """Return the download protocol of an info dict, inferring it from the URL if unset."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for scheme in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(scheme):
            return scheme

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3112
3113
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param delim        String to build a horizontal separator row below the header
                        (falsy to disable)
    @param extra_gap    Extra spaces to add between columns
    @param hide_empty   Drop columns whose cells are all empty
    """
    def width(string):
        # Visible width: ignore terminal escape sequences and the alignment tab
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only the columns whose corresponding filter entry is truthy;
        # columns beyond len(filterArray) are kept (fillvalue=True)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # With hide_empty, columns that are empty in every data row get width 0
    # and are filtered out of both the header and the data
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a separator row after the header
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Right-align the part after \t by padding at the tab position
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3144
3145
3146 def _match_one(filter_part, dct, incomplete):
3147 # TODO: Generalize code with YoutubeDL._build_format_filter
3148 STRING_OPERATORS = {
3149 '*=': operator.contains,
3150 '^=': lambda attr, value: attr.startswith(value),
3151 '$=': lambda attr, value: attr.endswith(value),
3152 '~=': lambda attr, value: re.search(value, attr),
3153 }
3154 COMPARISON_OPERATORS = {
3155 **STRING_OPERATORS,
3156 '<=': operator.le, # "<=" must be defined above "<"
3157 '<': operator.lt,
3158 '>=': operator.ge,
3159 '>': operator.gt,
3160 '=': operator.eq,
3161 }
3162
3163 if isinstance(incomplete, bool):
3164 is_incomplete = lambda _: incomplete
3165 else:
3166 is_incomplete = lambda k: k in incomplete
3167
3168 operator_rex = re.compile(r'''(?x)
3169 (?P<key>[a-z_]+)
3170 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3171 (?:
3172 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3173 (?P<strval>.+?)
3174 )
3175 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3176 m = operator_rex.fullmatch(filter_part.strip())
3177 if m:
3178 m = m.groupdict()
3179 unnegated_op = COMPARISON_OPERATORS[m['op']]
3180 if m['negation']:
3181 op = lambda attr, value: not unnegated_op(attr, value)
3182 else:
3183 op = unnegated_op
3184 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3185 if m['quote']:
3186 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3187 actual_value = dct.get(m['key'])
3188 numeric_comparison = None
3189 if isinstance(actual_value, (int, float)):
3190 # If the original field is a string and matching comparisonvalue is
3191 # a number we should respect the origin of the original field
3192 # and process comparison value as a string (see
3193 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3194 try:
3195 numeric_comparison = int(comparison_value)
3196 except ValueError:
3197 numeric_comparison = parse_filesize(comparison_value)
3198 if numeric_comparison is None:
3199 numeric_comparison = parse_filesize(f'{comparison_value}B')
3200 if numeric_comparison is None:
3201 numeric_comparison = parse_duration(comparison_value)
3202 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3203 raise ValueError('Operator %s only supports string values!' % m['op'])
3204 if actual_value is None:
3205 return is_incomplete(m['key']) or m['none_inclusive']
3206 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3207
3208 UNARY_OPERATORS = {
3209 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3210 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3211 }
3212 operator_rex = re.compile(r'''(?x)
3213 (?P<op>%s)\s*(?P<key>[a-z_]+)
3214 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3215 m = operator_rex.fullmatch(filter_part.strip())
3216 if m:
3217 op = UNARY_OPERATORS[m.group('op')]
3218 actual_value = dct.get(m.group('key'))
3219 if is_incomplete(m.group('key')) and actual_value is None:
3220 return True
3221 return op(actual_value)
3222
3223 raise ValueError('Invalid filter part %r' % filter_part)
3224
3225
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
    Can be True/False to indicate all/none of the keys may be missing.
    All conditions on incomplete keys pass if the key is missing
    """
    # Conditions are separated by unescaped '&'; every one must hold
    conditions = re.split(r'(?<!\\)&', filter_str)
    return all(
        _match_one(condition.replace(r'\&', '&'), dct, incomplete)
        for condition in conditions)
3236
3237
def match_filter_func(filters, breaking_filters=None):
    """Build a --match-filters style callable, or None if no filters are given.

    The returned function yields None to accept an entry, a string message to
    reject it, NO_DEFAULT to ask interactively, and raises RejectedVideoReached
    when a breaking filter matches.
    """
    if not filters and not breaking_filters:
        return None
    # Breaking filters reuse the same machinery; a non-None result aborts
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    # A lone '-' requests interactive confirmation instead of auto-accept
    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        ret = breaking_filters(info_dict, incomplete)
        if ret is not None:
            raise RejectedVideoReached(ret)

        passes = not filters or any(match_str(f, info_dict, incomplete) for f in filters)
        if passes:
            return NO_DEFAULT if interactive and not incomplete else None
        video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
        filter_str = ') | ('.join(map(str.strip, filters))
        return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3260
3261
class download_range_func:
    """Callable yielding the sections (chapters and/or time ranges) of a video to download."""

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters = chapters
        self.ranges = ranges
        self.from_info = from_info

    def __call__(self, info_dict, ydl):

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        # Yield every chapter whose title matches any of the given regexes
        for regex in self.chapters or []:
            for idx, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': idx}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # No restriction requested: a single empty section means "everything"
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # Negative timestamps count backwards from the end of the video
        if info.get('duration') and time < 0:
            return max(info['duration'] + time, 0)
        return time

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3302
3303
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float), or None if unparsable."""
    if not time_expr:
        return None

    # Plain offset, optionally suffixed with 's' (e.g. '12.3s')
    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    # Clock format HH:MM:SS[.fff] (some files use ':' instead of '.' for frames)
    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3315
3316
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours, minutes, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, msec)
3319
3320
def ass_subtitles_timecode(seconds):
    """Format a duration in seconds as an ASS timecode (H:MM:SS.cc, centisecond precision)."""
    hours, minutes, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (hours, minutes, secs, msec / 10)
3324
3325
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitle data to SRT.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> subtitle elements
    '''
    # Legacy TTAF namespaces that get rewritten (bytewise, before parsing)
    # to their modern TTML equivalents
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # TTML styling attributes that are translated into SRT-style markup below
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    # Helper to qualify tag/attribute names with the TTML namespaces
    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> {styling property: value}
    default_style = {}  # style inherited by all paragraphs (taken from <body>/<div>)

    class TTMLPElementParser:
        # XMLParser target that renders a single <p> element as SRT markup.
        # NOTE(review): these class-level mutable lists are shared between
        # instances; pushes in start() and pops in end() balance out over a
        # well-formed element, but per-instance initialization would be safer
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Effective style: default, then referenced style, then inline attrs
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip styles already applied by an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened by the matching start() in reverse order
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Re-serialize the element and feed it through TTMLPElementParser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve <style> elements; repeat while some parent styles are still
    # unresolved (styles may reference other styles in any document order)
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style attached to <body> or <div> becomes the default for every paragraph
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Fall back to begin + dur when no explicit end is given
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3492
3493
def cli_option(params, command_option, param, separator=None):
    """Build a CLI argument list for an option that takes a value.

    Returns [] when params[param] is unset; otherwise either
    [command_option, value] or ['<command_option><separator><value>'].
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3499
3500
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a CLI argument list for a boolean option, mapping True/False to the given strings."""
    flag = params.get(param)
    assert flag in (True, False, None)
    # Reuse cli_option by looking the boolean up in a {True: ..., False: ...} map;
    # None falls through cli_option's unset handling and yields []
    return cli_option({True: true_value, False: false_value}, command_option, flag, separator)
3505
3506
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3509
3510
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick the configuration args for the first matching key group.

    @param argdict    dict of {key: [args]}, a plain list/tuple (legacy format),
                      or None
    @param keys       list/tuple of key groups; each group is a single key or a
                      tuple of alias keys, and all aliases of the first group
                      with any match contribute their args (concatenated)
    @param use_compat whether a legacy list/tuple argdict is returned as-is
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if not use_compat:
            argdict = None
        else:
            return argdict
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        matched = [argdict.get(key.lower()) for key in variadic(key_list)]
        matched = [args for args in matched if args is not None]
        if matched:
            return list(itertools.chain.from_iterable(matched))
    return default
3529
3530
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve the configuration args for an executable under a main key.

    Builds the lookup keys ('<exe>' or '<main_key>+<exe>', plus any subkeys)
    and delegates to cli_configuration_args.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{key}' for key in (keys or [''])]
    if root_key not in keys:
        # Only specific subkeys were requested - no generic fallbacks,
        # and the legacy list format does not apply
        use_compat = False
    else:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
3542
3543
class ISO639Utils:
    """Conversions between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters matter, so e.g. 'en-US' also resolves
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        return next(
            (short_code for short_code, long_code in cls._lang_map.items() if long_code == code),
            None)
3749
class ISO3166Utils:
    """Lookup of full English country names from ISO 3166-1 alpha-2 codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full country name for a two-letter (ISO 3166-1 alpha-2) code, or None."""
        return cls._country_map.get(code.upper())
4012
4013 class GeoUtils:
4014 # Major IPv4 address blocks per country
4015 _country_ip_map = {
4016 'AD': '46.172.224.0/19',
4017 'AE': '94.200.0.0/13',
4018 'AF': '149.54.0.0/17',
4019 'AG': '209.59.64.0/18',
4020 'AI': '204.14.248.0/21',
4021 'AL': '46.99.0.0/16',
4022 'AM': '46.70.0.0/15',
4023 'AO': '105.168.0.0/13',
4024 'AP': '182.50.184.0/21',
4025 'AQ': '23.154.160.0/24',
4026 'AR': '181.0.0.0/12',
4027 'AS': '202.70.112.0/20',
4028 'AT': '77.116.0.0/14',
4029 'AU': '1.128.0.0/11',
4030 'AW': '181.41.0.0/18',
4031 'AX': '185.217.4.0/22',
4032 'AZ': '5.197.0.0/16',
4033 'BA': '31.176.128.0/17',
4034 'BB': '65.48.128.0/17',
4035 'BD': '114.130.0.0/16',
4036 'BE': '57.0.0.0/8',
4037 'BF': '102.178.0.0/15',
4038 'BG': '95.42.0.0/15',
4039 'BH': '37.131.0.0/17',
4040 'BI': '154.117.192.0/18',
4041 'BJ': '137.255.0.0/16',
4042 'BL': '185.212.72.0/23',
4043 'BM': '196.12.64.0/18',
4044 'BN': '156.31.0.0/16',
4045 'BO': '161.56.0.0/16',
4046 'BQ': '161.0.80.0/20',
4047 'BR': '191.128.0.0/12',
4048 'BS': '24.51.64.0/18',
4049 'BT': '119.2.96.0/19',
4050 'BW': '168.167.0.0/16',
4051 'BY': '178.120.0.0/13',
4052 'BZ': '179.42.192.0/18',
4053 'CA': '99.224.0.0/11',
4054 'CD': '41.243.0.0/16',
4055 'CF': '197.242.176.0/21',
4056 'CG': '160.113.0.0/16',
4057 'CH': '85.0.0.0/13',
4058 'CI': '102.136.0.0/14',
4059 'CK': '202.65.32.0/19',
4060 'CL': '152.172.0.0/14',
4061 'CM': '102.244.0.0/14',
4062 'CN': '36.128.0.0/10',
4063 'CO': '181.240.0.0/12',
4064 'CR': '201.192.0.0/12',
4065 'CU': '152.206.0.0/15',
4066 'CV': '165.90.96.0/19',
4067 'CW': '190.88.128.0/17',
4068 'CY': '31.153.0.0/16',
4069 'CZ': '88.100.0.0/14',
4070 'DE': '53.0.0.0/8',
4071 'DJ': '197.241.0.0/17',
4072 'DK': '87.48.0.0/12',
4073 'DM': '192.243.48.0/20',
4074 'DO': '152.166.0.0/15',
4075 'DZ': '41.96.0.0/12',
4076 'EC': '186.68.0.0/15',
4077 'EE': '90.190.0.0/15',
4078 'EG': '156.160.0.0/11',
4079 'ER': '196.200.96.0/20',
4080 'ES': '88.0.0.0/11',
4081 'ET': '196.188.0.0/14',
4082 'EU': '2.16.0.0/13',
4083 'FI': '91.152.0.0/13',
4084 'FJ': '144.120.0.0/16',
4085 'FK': '80.73.208.0/21',
4086 'FM': '119.252.112.0/20',
4087 'FO': '88.85.32.0/19',
4088 'FR': '90.0.0.0/9',
4089 'GA': '41.158.0.0/15',
4090 'GB': '25.0.0.0/8',
4091 'GD': '74.122.88.0/21',
4092 'GE': '31.146.0.0/16',
4093 'GF': '161.22.64.0/18',
4094 'GG': '62.68.160.0/19',
4095 'GH': '154.160.0.0/12',
4096 'GI': '95.164.0.0/16',
4097 'GL': '88.83.0.0/19',
4098 'GM': '160.182.0.0/15',
4099 'GN': '197.149.192.0/18',
4100 'GP': '104.250.0.0/19',
4101 'GQ': '105.235.224.0/20',
4102 'GR': '94.64.0.0/13',
4103 'GT': '168.234.0.0/16',
4104 'GU': '168.123.0.0/16',
4105 'GW': '197.214.80.0/20',
4106 'GY': '181.41.64.0/18',
4107 'HK': '113.252.0.0/14',
4108 'HN': '181.210.0.0/16',
4109 'HR': '93.136.0.0/13',
4110 'HT': '148.102.128.0/17',
4111 'HU': '84.0.0.0/14',
4112 'ID': '39.192.0.0/10',
4113 'IE': '87.32.0.0/12',
4114 'IL': '79.176.0.0/13',
4115 'IM': '5.62.80.0/20',
4116 'IN': '117.192.0.0/10',
4117 'IO': '203.83.48.0/21',
4118 'IQ': '37.236.0.0/14',
4119 'IR': '2.176.0.0/12',
4120 'IS': '82.221.0.0/16',
4121 'IT': '79.0.0.0/10',
4122 'JE': '87.244.64.0/18',
4123 'JM': '72.27.0.0/17',
4124 'JO': '176.29.0.0/16',
4125 'JP': '133.0.0.0/8',
4126 'KE': '105.48.0.0/12',
4127 'KG': '158.181.128.0/17',
4128 'KH': '36.37.128.0/17',
4129 'KI': '103.25.140.0/22',
4130 'KM': '197.255.224.0/20',
4131 'KN': '198.167.192.0/19',
4132 'KP': '175.45.176.0/22',
4133 'KR': '175.192.0.0/10',
4134 'KW': '37.36.0.0/14',
4135 'KY': '64.96.0.0/15',
4136 'KZ': '2.72.0.0/13',
4137 'LA': '115.84.64.0/18',
4138 'LB': '178.135.0.0/16',
4139 'LC': '24.92.144.0/20',
4140 'LI': '82.117.0.0/19',
4141 'LK': '112.134.0.0/15',
4142 'LR': '102.183.0.0/16',
4143 'LS': '129.232.0.0/17',
4144 'LT': '78.56.0.0/13',
4145 'LU': '188.42.0.0/16',
4146 'LV': '46.109.0.0/16',
4147 'LY': '41.252.0.0/14',
4148 'MA': '105.128.0.0/11',
4149 'MC': '88.209.64.0/18',
4150 'MD': '37.246.0.0/16',
4151 'ME': '178.175.0.0/17',
4152 'MF': '74.112.232.0/21',
4153 'MG': '154.126.0.0/17',
4154 'MH': '117.103.88.0/21',
4155 'MK': '77.28.0.0/15',
4156 'ML': '154.118.128.0/18',
4157 'MM': '37.111.0.0/17',
4158 'MN': '49.0.128.0/17',
4159 'MO': '60.246.0.0/16',
4160 'MP': '202.88.64.0/20',
4161 'MQ': '109.203.224.0/19',
4162 'MR': '41.188.64.0/18',
4163 'MS': '208.90.112.0/22',
4164 'MT': '46.11.0.0/16',
4165 'MU': '105.16.0.0/12',
4166 'MV': '27.114.128.0/18',
4167 'MW': '102.70.0.0/15',
4168 'MX': '187.192.0.0/11',
4169 'MY': '175.136.0.0/13',
4170 'MZ': '197.218.0.0/15',
4171 'NA': '41.182.0.0/16',
4172 'NC': '101.101.0.0/18',
4173 'NE': '197.214.0.0/18',
4174 'NF': '203.17.240.0/22',
4175 'NG': '105.112.0.0/12',
4176 'NI': '186.76.0.0/15',
4177 'NL': '145.96.0.0/11',
4178 'NO': '84.208.0.0/13',
4179 'NP': '36.252.0.0/15',
4180 'NR': '203.98.224.0/19',
4181 'NU': '49.156.48.0/22',
4182 'NZ': '49.224.0.0/14',
4183 'OM': '5.36.0.0/15',
4184 'PA': '186.72.0.0/15',
4185 'PE': '186.160.0.0/14',
4186 'PF': '123.50.64.0/18',
4187 'PG': '124.240.192.0/19',
4188 'PH': '49.144.0.0/13',
4189 'PK': '39.32.0.0/11',
4190 'PL': '83.0.0.0/11',
4191 'PM': '70.36.0.0/20',
4192 'PR': '66.50.0.0/16',
4193 'PS': '188.161.0.0/16',
4194 'PT': '85.240.0.0/13',
4195 'PW': '202.124.224.0/20',
4196 'PY': '181.120.0.0/14',
4197 'QA': '37.210.0.0/15',
4198 'RE': '102.35.0.0/16',
4199 'RO': '79.112.0.0/13',
4200 'RS': '93.86.0.0/15',
4201 'RU': '5.136.0.0/13',
4202 'RW': '41.186.0.0/16',
4203 'SA': '188.48.0.0/13',
4204 'SB': '202.1.160.0/19',
4205 'SC': '154.192.0.0/11',
4206 'SD': '102.120.0.0/13',
4207 'SE': '78.64.0.0/12',
4208 'SG': '8.128.0.0/10',
4209 'SI': '188.196.0.0/14',
4210 'SK': '78.98.0.0/15',
4211 'SL': '102.143.0.0/17',
4212 'SM': '89.186.32.0/19',
4213 'SN': '41.82.0.0/15',
4214 'SO': '154.115.192.0/18',
4215 'SR': '186.179.128.0/17',
4216 'SS': '105.235.208.0/21',
4217 'ST': '197.159.160.0/19',
4218 'SV': '168.243.0.0/16',
4219 'SX': '190.102.0.0/20',
4220 'SY': '5.0.0.0/16',
4221 'SZ': '41.84.224.0/19',
4222 'TC': '65.255.48.0/20',
4223 'TD': '154.68.128.0/19',
4224 'TG': '196.168.0.0/14',
4225 'TH': '171.96.0.0/13',
4226 'TJ': '85.9.128.0/18',
4227 'TK': '27.96.24.0/21',
4228 'TL': '180.189.160.0/20',
4229 'TM': '95.85.96.0/19',
4230 'TN': '197.0.0.0/11',
4231 'TO': '175.176.144.0/21',
4232 'TR': '78.160.0.0/11',
4233 'TT': '186.44.0.0/15',
4234 'TV': '202.2.96.0/19',
4235 'TW': '120.96.0.0/11',
4236 'TZ': '156.156.0.0/14',
4237 'UA': '37.52.0.0/14',
4238 'UG': '102.80.0.0/13',
4239 'US': '6.0.0.0/8',
4240 'UY': '167.56.0.0/13',
4241 'UZ': '84.54.64.0/18',
4242 'VA': '212.77.0.0/19',
4243 'VC': '207.191.240.0/21',
4244 'VE': '186.88.0.0/13',
4245 'VG': '66.81.192.0/20',
4246 'VI': '146.226.0.0/16',
4247 'VN': '14.160.0.0/11',
4248 'VU': '202.80.32.0/20',
4249 'WF': '117.20.32.0/21',
4250 'WS': '202.4.32.0/19',
4251 'YE': '134.35.0.0/16',
4252 'YT': '41.242.116.0/22',
4253 'ZA': '41.0.0.0/11',
4254 'ZM': '102.144.0.0/13',
4255 'ZW': '102.177.192.0/18',
4256 }
4257
4258 @classmethod
4259 def random_ipv4(cls, code_or_block):
4260 if len(code_or_block) == 2:
4261 block = cls._country_ip_map.get(code_or_block.upper())
4262 if not block:
4263 return None
4264 else:
4265 block = code_or_block
4266 addr, preflen = block.split('/')
4267 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4268 addr_max = addr_min | (0xffffffff >> int(preflen))
4269 return str(socket.inet_ntoa(
4270 struct.pack('!L', random.randint(addr_min, addr_max))))
4271
4272
4273 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4274 # released into Public Domain
4275 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4276
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # int.to_bytes with the minimal length already has no leading zeros,
        # replacing the original 32-bit chunking + manual zero stripping
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        # non-positive input collapses to a single NUL byte,
        # matching the original PyCrypto-derived behaviour
        s = b'\000'
    # pad the front so the total length is a multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4305
4306
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes handles any length (including empty -> 0) without the
    # manual front-padding and 32-bit chunking of the PyCrypto original
    return int.from_bytes(s, 'big')
4322
4323
4324 def ohdave_rsa_encrypt(data, exponent, modulus):
4325 '''
4326 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4327
4328 Input:
4329 data: data to encrypt, bytes-like object
4330 exponent, modulus: parameter e and N of RSA algorithm, both integer
4331 Output: hex string of encrypted data
4332
4333 Limitation: supports one block encryption only
4334 '''
4335
4336 payload = int(binascii.hexlify(data[::-1]), 16)
4337 encrypted = pow(payload, exponent, modulus)
4338 return '%x' % encrypted
4339
4340
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # PKCS#1 v1.5 (RFC 8017, EME-PKCS1-v1_5) requires the padding string to be
    # NONZERO octets: a zero byte is the padding terminator and would truncate
    # the payload on decryption. The previous randint(0, 254) could emit zeros.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4354
4355
4356 def _base_n_table(n, table):
4357 if not table and not n:
4358 raise ValueError('Either table or n must be specified')
4359 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4360
4361 if n and n != len(table):
4362 raise ValueError(f'base {n} exceeds table length {len(table)}')
4363 return table
4364
4365
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4377
4378
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    digit_values = {char: index for index, char in enumerate(_base_n_table(n, table))}
    base = len(digit_values)
    result = 0
    for char in string:
        result = result * base + digit_values[char]
    return result
4386
4387
def decode_packed_codes(code):
    """Expand obfuscated code that was packed with a base-n symbol table."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # map each base-n encoded index to its symbol, falling back to the
    # encoded index itself when the symbol slot is empty
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        encoded = encode_base_n(index, base)
        symbol_table[encoded] = symbols[index] or encoded

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
4404
4405
def caesar(s, alphabet, shift):
    """Rotate every character of *s* found in *alphabet* by *shift* positions
    (wrapping around); characters outside the alphabet pass through unchanged."""
    if not shift:
        return s
    size = len(alphabet)

    def rotate(ch):
        if ch not in alphabet:
            return ch
        return alphabet[(alphabet.index(ch) + shift) % size]

    return ''.join(map(rotate, s))
4413
4414
def rot47(s):
    """Apply the ROT47 substitution over the printable ASCII range."""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4417
4418
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (KEY=value pairs, values optionally quoted)
    into a dict of strings."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # strip the surrounding quotes from quoted values
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
4426
4427
def urshift(val, n):
    """Right-shift *val* by *n* bits, treating negative values as their
    unsigned 32-bit representation."""
    if val >= 0:
        return val >> n
    return (val + 0x100000000) >> n
4430
4431
def write_xattr(path, key, value):
    """Write the extended attribute *key* = *value* (bytes) on the file *path*.

    Tries, in order: NTFS ADS (Windows), os.setxattr / the `xattr`/`pyxattr`
    modules, then the `setfattr`/`xattr` command-line tools.
    Raises XAttrMetadataError on write failure, XAttrUnavailableError when no
    method is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            # 'path:key' opens the named alternate data stream of the file
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use os.setxattr/xattrs/pyxattrs modules

    setxattr = None
    if callable(getattr(os, 'setxattr', None)):
        setxattr = os.setxattr
    elif getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # the external tools take the value as a text argument
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4483
4484
def random_birthday(year_field, month_field, day_field):
    """Pick a uniformly random date between 1950-01-01 and 1995-12-31 and
    return it as a dict of stringified form-field values."""
    first = datetime.date(1950, 1, 1)
    last = datetime.date(1995, 12, 31)
    span_days = (last - first).days
    chosen = first + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
4495
4496
def find_available_port(interface=''):
    """Ask the OS for a currently-free TCP port on *interface*;
    returns None if a socket cannot be created or bound."""
    try:
        sock = socket.socket()
    except OSError:
        return None
    try:
        sock.bind((interface, 0))  # port 0 = let the kernel choose
        return sock.getsockname()[1]
    except OSError:
        return None
    finally:
        sock.close()
4504
4505
# Templates for internet shortcut files, which are plain text files.
# Windows-style .url shortcut (INI format)
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS .webloc shortcut (XML property list)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org .desktop link entry (Linux desktop environments)
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# maps the --write-link option names to the template used for each
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4537
4538
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    # NOTE(review): this drops port 80 regardless of scheme (e.g. also for
    # https), which changes the URL's meaning for non-http schemes — confirm intended.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4581
4582
def to_high_limit_path(path):
    """On Windows, convert *path* to an extended-length path ('\\\\?\\' prefix)
    to work around the MAX_PATH limitation; elsewhere return it unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
4589
4590
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Traverse *field* out of *obj*, apply *func* and *template* to it, or
    return *default* when the value is falsy/ignored."""
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val
    else:
        skip = val in variadic(ignore)
    return default if skip else template % func(val)
4596
4597
def clean_podcast_url(url):
    """Strip known podcast tracking/analytics redirect prefixes from *url*,
    then collapse a doubled scheme (e.g. 'https://https://...') left behind."""
    url = re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/''', '', url)
    # after removing a tracker host, the real scheme may follow the outer one
    return re.sub(r'^\w+://(\w+://)', r'\1', url)
4618
4619
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random UUIDv4-shaped string: version nibble fixed to '4',
    every other x/y position replaced with a random hex digit."""
    return ''.join(
        _HEX_TABLE[random.randint(0, 15)] if ch in 'xy' else ch
        for ch in 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4625
4626
def make_dir(path, to_screen=None):
    """Create the parent directory of *path* (like `mkdir -p`).

    @param to_screen optional callable used to report a failure message
    @returns True on success (or when *path* has no directory part), False on failure
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Previously `callable(to_screen) is not None`, which is always True
        # (callable() returns a bool), so a failure with the default
        # to_screen=None raised TypeError instead of returning False
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4637
4638
def get_executable_path():
    """Return the directory containing the running executable/script."""
    from ..update import _get_variant_and_executable_path

    exe_path = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(exe_path))
4643
4644
def get_user_config_dirs(package_name):
    """Yield the candidate per-user configuration directories for *package_name*."""
    # XDG config directory (e.g. ~/.config/package_name)
    yield os.path.join(
        os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config'),
        package_name)

    # Windows per-user appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # plain dotfile directory in the home directory (~/.package_name)
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')
4657
4658
def get_system_config_dirs(package_name):
    """Yield the candidate system-wide configuration directories for *package_name*."""
    # currently only /etc/package_name
    yield os.path.join('/etc', package_name)
4662
4663
def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)

    The keyword arguments (timedelta kwargs, e.g. hours=9) specify the
    timezone offset to add.
    """
    offset = datetime.timedelta(**kwargs).total_seconds()
    return time.time() + offset
4669
4670
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create an HS256-signed JWT in JWS Compact Serialization, as bytes.

    @param payload_data JSON-serializable claims object
    @param key          shared secret (str)
    @param headers      extra JOSE header fields, merged over the defaults
                        (was a mutable default `{}`; now None for safety)
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    # NB: uses standard (not URL-safe) base64 and keeps '=' padding,
    # matching the original implementation
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4688
4689
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded payload of a JWT *without* verifying its signature."""
    _, payload_b64, _ = jwt.split('.')
    # re-add any stripped base64 padding; superfluous '=' are ignored
    return json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
4696
4697
# None = not applicable (non-Windows); on Windows it starts False and is
# flipped to True by windows_enable_vt_mode() once VT processing is enabled
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4699
4700
@functools.cache
def supports_terminal_sequences(stream):
    """Whether ANSI terminal sequences are usable when writing to *stream*."""
    on_windows = compat_os_name == 'nt'
    # Windows needs VT processing enabled; elsewhere a TERM variable must be set
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
4712
4713
def windows_enable_vt_mode():
    """Enable virtual-terminal (ANSI escape) processing on the Windows console.

    On success, sets the module-level WINDOWS_VT_MODE flag and invalidates the
    supports_terminal_sequences() cache.
    Ref: https://bugs.python.org/issue30075 """
    # VT processing is only available from Windows 10 TH2 (build 10586)
    if get_windows_version() < (10, 0, 10586):
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # open the console output device directly rather than relying on stdout,
    # which may be redirected
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # OR the VT flag into the existing console mode
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # cached results were computed with VT mode off; recompute on next call
    supports_terminal_sequences.cache_clear()
4744
4745
# matches ANSI SGR sequences such as '\033[31m' / '\033[0m'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip ANSI color/style escape sequences from *string*."""
    return _terminal_sequences_re.sub('', string)
4751
4752
def number_of_digits(number):
    """Length of *number* rendered with '%d' (includes a '-' sign, truncates floats)."""
    rendered = '%d' % number
    return len(rendered)
4755
4756
def join_nonempty(*values, delim='-', from_dict=None):
    """Stringify and join the truthy *values* with *delim*; when *from_dict*
    is given, the values are first looked up from it via traversal."""
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
4761
4762
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    best = max(
        (tuple(fmt.get(key) or 0 for key in dimension_keys) for fmt in formats),
        default=(0, 0))
    if not best[0]:
        # no format declares a width; leave the thumbnails untouched
        return thumbnails
    best_width = str(best[0])
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, best_width, thumb['url'])},
            dict(zip(dimension_keys, best)), thumb)
        for thumb in thumbnails
    ]
4783
4784
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    match = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if match is None:
        return None, None, None
    start, end, total = match.groups()
    # end/total may be absent, in which case int_or_none yields None
    return int(start), int_or_none(end), int_or_none(total)
4793
4794
def read_stdin(what):
    """Announce (when *what* is given) that input is expected, then return stdin."""
    if what:
        eof_key = 'Ctrl+D'
        if compat_os_name == 'nt':
            eof_key = 'Ctrl+Z'
        write_string(f'Reading {what} from STDIN - EOF ({eof_key}) to end:\n')
    return sys.stdin
4800
4801
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """
    # a BOM at the start of the data wins over any in-band declaration
    for bom, encoding in BOMS:
        if data.startswith(bom):
            return encoding, len(bom)

    # drop NUL bytes so a UTF-16/UTF-32 encoded declaration still matches,
    # regardless of its endianness
    stripped = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if declaration:
        return declaration.group(1).decode(), 0
    return None, 0
4818
4819
class Config:
    """A parsed configuration source (CLI args or a config file), which may
    recursively include further configs via --config-locations."""
    own_args = None      # the raw argument list this config was created from
    parsed_args = None   # set once load_configs() has run
    filename = None      # backing file, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Load *args*/*filename* and any configs they reference.
        @returns False if this file was already loaded (recursion guard)"""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False  # already loaded; avoid infinite config recursion
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # '-' means read additional options from stdin (only once)
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read *filename* and shlex-split its contents into an argument list;
        returns *default* when the file cannot be opened."""
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)  # skip a BOM, if any
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # include the actual file name (was a hardcoded "(unknown)" placeholder)
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of *opts* with the values of credential options
        replaced by the string 'PRIVATE' (both '--opt value' and '--opt=value')."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        # child configs share the loaded-path set so recursion is detected globally
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        # later-appended configs take lower precedence: own args come last
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
4927
4928
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in headers.items():
            # title-casing normalizes e.g. 'content-type' and 'Content-Type'
            merged[name.title()] = value
    return merged
4932
4933
def cached_method(f):
    """Cache a method"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # bind + apply_defaults normalizes positional/keyword spellings so
        # equivalent calls share one cache key
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        cache_key = tuple(bound.arguments.values())[1:]  # drop `self`

        # per-instance cache, namespaced by method name
        store = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
        if cache_key not in store:
            store[cache_key] = f(self, *args, **kwargs)
        return store[cache_key]
    return wrapper
4949
4950
class classproperty:
    """property access for class methods with optional caching"""
    def __new__(cls, func=None, *args, **kwargs):
        # Support both `@classproperty` and `@classproperty(cache=True)`:
        # when no function is given yet, return a decorator expecting one
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        # the cache is keyed on the owner class, so each subclass gets
        # its own independently computed value
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        # the instance (first descriptor argument) is deliberately ignored;
        # the wrapped function always receives the class
        if self._cache is None:
            return self.func(cls)
        elif cls not in self._cache:
            self._cache[cls] = self.func(cls)
        return self._cache[cls]
4969
4970
class function_with_repr:
    """Wrap a callable, optionally overriding the text returned by repr()."""

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func = func
        self.__repr = repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        # fall back to module.qualname when no (truthy) override was given
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
4983
4984
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        # iterate over the attribute VALUES, in insertion order
        yield from vars(self).values()

    @property
    def items_(self):
        # trailing underscore avoids clashing with a potential `items` attribute
        return vars(self).items()
4994
4995
# Known media-related file extensions, grouped by kind.
# NB: `video` and `audio` are extended below to also contain the `common_*` sets.
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# fold the "common" sets into the full video/audio lists
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# every extension recognised as downloadable media (video, audio or manifest)
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5010
5011
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # attempt: number of attempts made so far; _error: last error, where
    # NO_DEFAULT marks "attempt in progress, no error recorded yet"
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        # extra kwargs are pre-bound into the error callback
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # retry while the last attempt recorded an error (or none was made yet)
        # and the attempt budget is not exhausted
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # NO_DEFAULT (attempt in progress) is presented to users as "no error"
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT  # reset before each attempt
            self.attempt += 1
            yield self
            # the loop body sets `retry.error` on failure; report it
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        # budget exhausted: report via `error` if given, otherwise raise
        if count > retries:
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # prefer the underlying cause/original message for readability
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        # sleep_func may be a constant delay or a callable of the retry count
        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5066
5067
def make_archive_id(ie, video_id):
    """Build a download-archive entry: lowercased extractor key, space, video id."""
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
5071
5072
def truncate_string(s, left, right=0):
    """Shorten *s* to at most left+right characters, eliding the middle with '...'
    (the ellipsis is counted against the *left* budget). None passes through."""
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5078
5079
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Resolve *options* (names, aliases, optional regexes, '-' prefixed removals)
    against *alias_dict* into a de-duplicated ordered list.

    @param alias_dict  maps alias -> list of names; must contain the 'all' alias
                       listing every valid name
    @param use_regex   treat non-alias entries as case-insensitive regexes
    @param start       initial selection to build upon
    @raises ValueError for an entry matching neither an alias nor a known name
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for val in options:
        # a leading '-' means "remove this (or everything it expands to)"
        discard = val.startswith('-')
        if discard:
            val = val[1:]

        if val in alias_dict:
            # when discarding an alias, negate each of its members instead
            val = alias_dict[val] if not discard else [
                i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(val, alias_dict, start=requested)
            continue

        current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
                   else [val] if val in alias_dict['all'] else None)
        if current is None:
            raise ValueError(val)

        if discard:
            for item in current:
                while item in requested:
                    requested.remove(item)
        else:
            requested.extend(current)

    return orderedSet(requested)
5108
5109
5110 # TODO: Rewrite
class FormatSorter:
    """Sort media formats according to a user/extractor-given list of fields.

    A sort specification is a sequence of items like ``field``, ``+field``
    (reverse), ``field:limit`` (prefer values up to *limit*) or ``field~limit``
    (prefer values closest to *limit*). ``calculate_preference`` converts a
    format dict into a tuple of per-field preference tuples; these tuples
    compare lexicographically, with larger meaning "better".
    """

    # Parses a single sort-spec item into reverse marker, field name,
    # separator (':' or '~') and limit text. Matches the empty string too,
    # in which case the 'field' group is None.
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    # Default sort order, always appended after user/extractor fields
    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Sort order emulating youtube-dl's behavior (used elsewhere via --compat-options)
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration. Recognized keys (defaults are filled in lazily
    # by _get_field_setting):
    #   type: 'field' (default), 'ordered', 'extractor', 'boolean',
    #         'combined'/'multiple' (derived from several other fields), 'alias'
    #   field: the format-dict key (or tuple of sub-fields) backing this name
    #   order/order_free: ranking lists for 'ordered' fields (earlier = better;
    #         'order_free' is used when prefer_free_formats is set)
    #   regex: entries of 'order' are regexes rather than literal values
    #   convert: how to coerce values ('float', 'float_none', 'bytes',
    #         'string', 'order', 'ignore', or the 'float_string' fallback)
    #   forced/priority/visible/max/in_list/not_in_list/function/default:
    #         see their uses in the methods below
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        # Derived fields, computed from several underlying fields
        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        """Build the effective sort order from *field_preference* (the
        extractor-supplied spec) combined with the user's params on *ydl*."""
        self.ydl = ydl
        self._order = []  # effective field order, filled by evaluate_params
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return setting *key* for *field*, lazily filling in defaults.

        Unknown fields are accepted (with a deprecation notice) and get an
        empty settings entry; computed defaults are cached back into
        ``self.settings`` so later lookups are plain dict reads.
        """
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')
            if key == 'field':
                # 'combined'/'multiple' types expect a tuple of sub-fields
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert a raw field *value* to a sortable number (or string).

        With ``convertNone=True``, ``None`` is pushed through the conversion
        instead of short-circuiting (used for 'ordered' fields, whose order
        lists may rank ``None`` explicitly).
        """
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            # Values not in the list rank at the ''-entry; if there is none,
            # they rank below every listed entry (list_length + 1)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            # 'float_string' fallback: numeric strings become floats; the
            # first non-numeric value switches the field to string sorting
            if value.isnumeric():
                return float(value)
            else:
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Combine forced/priority defaults, the user's --format-sort, the
        extractor's spec and the class defaults into ``self._order``, storing
        per-field reverse/closest/limit data into ``self.settings``."""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Register one concrete field; the first occurrence of a field wins
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # Precedence: forced fields always first; priority fields next unless
        # format_sort_force lets the user's spec override them; then user,
        # extractor and finally the built-in defaults
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # A 'combined' field expands to its sub-fields, each optionally
            # taking its own ':'-separated limit
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Write the resolved sort order (user, extractor and effective) via
        *write_debug*, skipping fields marked as not visible."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Map one field *value* to a comparison tuple.

        The returned tuples compare lexicographically; the leading element
        buckets values (missing < strings-bucket < limited-out < normal) and
        the remaining elements order within the bucket.
        """
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Compute the preference tuple of a single sort field for *format*,
        resolving 'multiple' fields through their aggregation function."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Return the sort key for *format* as a tuple of per-field tuples.

        Also fills in missing derived fields (protocol, ext, video/audio ext,
        bitrates) on *format* as a side effect, since later fields rely on them.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #     format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5405
5406
5407 # XXX: Temporary
5408 class _YDLLogger:
5409 def __init__(self, ydl=None):
5410 self._ydl = ydl
5411
5412 def debug(self, message):
5413 if self._ydl:
5414 self._ydl.write_debug(message)
5415
5416 def info(self, message):
5417 if self._ydl:
5418 self._ydl.to_screen(message)
5419
5420 def warning(self, message, *, once=False):
5421 if self._ydl:
5422 self._ydl.report_warning(message, once)
5423
5424 def error(self, message, *, is_error=True):
5425 if self._ydl:
5426 self._ydl.report_error(message, is_error=is_error)
5427
5428 def stdout(self, message):
5429 if self._ydl:
5430 self._ydl.to_stdout(message)
5431
5432 def stderr(self, message):
5433 if self._ydl:
5434 self._ydl.to_stderr(message)