# yt_dlp/utils/_utils.py
# [networking] Rewrite architecture (#2861)
1 import asyncio
2 import atexit
3 import base64
4 import binascii
5 import calendar
6 import codecs
7 import collections
8 import collections.abc
9 import contextlib
10 import datetime
11 import email.header
12 import email.utils
13 import errno
14 import hashlib
15 import hmac
16 import html.entities
17 import html.parser
18 import inspect
19 import io
20 import itertools
21 import json
22 import locale
23 import math
24 import mimetypes
25 import netrc
26 import operator
27 import os
28 import platform
29 import random
30 import re
31 import shlex
32 import socket
33 import ssl
34 import struct
35 import subprocess
36 import sys
37 import tempfile
38 import time
39 import traceback
40 import types
41 import unicodedata
42 import urllib.error
43 import urllib.parse
44 import urllib.request
45 import xml.etree.ElementTree
46
47 from . import traversal
48
49 from ..compat import functools # isort: split
50 from ..compat import (
51 compat_etree_fromstring,
52 compat_expanduser,
53 compat_HTMLParseError,
54 compat_os_name,
55 compat_shlex_quote,
56 )
57 from ..dependencies import websockets, xattr
58
# Pretend to be the parent module, so that names defined here appear to
# live directly under `yt_dlp.utils` (e.g. in tracebacks and reprs)
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise: the canonical type of a compiled
# regular expression, usable in isinstance() checks
compiled_regex_type = type(re.compile(''))


# Fixed User-Agent strings for extractors that need to impersonate a
# specific browser
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
68
69
class NO_DEFAULT:
    """Sentinel type used to distinguish 'no argument supplied' from None."""
    pass


def IDENTITY(x):
    """Identity function: returns its argument unchanged."""
    return x
76
77
# English month names, indexed by (month number - 1)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month-name lists keyed by language code, used when parsing free-form dates
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Maps timezone abbreviation -> UTC offset in hours
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
# maps each accented character to its ASCII approximation
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
108
# strptime formats tried, in order, when parsing free-form date strings
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Additional formats for locales that write the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# Additional formats for locales that write the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of a P.A.C.K.E.R.-packed javascript payload
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches <script type="application/ld+json"> blocks (JSON-LD metadata)
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# An integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
179
180
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    encoding = 'UTF-8'
    # Fall back to UTF-8 when the locale reports nothing usable
    with contextlib.suppress(Exception):
        candidate = locale.getpreferredencoding()
        'TEST'.encode(candidate)  # verify the codec actually works
        encoding = candidate
    return encoding
195
196
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # Write to a temporary file in the destination directory first, then
    # rename it over the target so readers never observe a partial file
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile creates the file with restrictive 0600
            # permissions; widen them to honor the process umask, as a
            # plain open() would have
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file before re-raising
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
221
222
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
228
# Historical note: xpath_with_ns predates reliable namespace support in
# xml.etree.ElementTree; it expands 'ns:tag' path components into the
# fully-qualified '{uri}tag' form that ElementTree understands.
231
232
def xpath_with_ns(path, ns_map):
    """Expand 'ns:tag' components of an XPath into '{uri}tag' form using ns_map."""
    def expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            return parts[0]
        ns, tag = parts
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join(expand(component) for component in path.split('/'))
243
244
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching `xpath` (a string, or an iterable of
    candidate xpaths tried in order) under `node`.

    @param name     human-readable name used in the error message
    @param fatal    raise ExtractorError when nothing matches
    @param default  value returned when nothing matches (takes precedence over fatal)
    """
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        # Initialize so an empty iterable of xpaths does not raise
        # UnboundLocalError below
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
266
267
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but returns the matched element's text content."""
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element
    if element.text is not None:
        return element.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
281
282
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the first element matching xpath[@key]."""
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (f'{xpath}[@{key}]' if name is None else name))
    return None
294
295
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute in the passed HTML document"""
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the html of the first tag with the specified attribute in the passed HTML document"""
    # NOTE: 'kargs' (sic) is kept as-is; renaming would break callers that
    # pass it by keyword
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # NOTE(review): **kargs is accepted but not forwarded — confirm intended
    # The lookbehind/lookahead match class_name as a whole word inside a
    # (possibly multi-valued) class attribute
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
350
351
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    # If the value begins with a character that cannot appear unquoted in an
    # HTML attribute value, the surrounding quote is mandatory; otherwise the
    # quote is optional
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches the opening tag up to and including attribute=value; (?-x:...)
    # disables verbose mode so whitespace inside `value` stays significant
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # strip enclosing quotes that may have been captured with the content
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
377
378
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Control-flow exception: signals that the outermost tag was closed
        pass

    def __init__(self):
        # Stack of currently-open tag names (outermost first)
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until we find the matching opening tag; unmatched inner tags
        # (e.g. unclosed <p>) are silently discarded
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The outermost tag has been closed; abort parsing
            raise self.HTMLBreakOnClosingTagException()
419
420
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index that raises a parse error instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed just the opening tag and verify the parser agrees on its name
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            # Advance through candidate closing tags until the parser raises
            # HTMLBreakOnClosingTagException, i.e. the outermost tag really
            # closed; this correctly skips closing tags of nested same-name
            # elements
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
455
456
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        # Only the first start tag is of interest; abort parsing by raising
        # (the caller, e.g. extract_attributes, suppresses this exception)
        raise compat_HTMLParseError('done')
467
468
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        self.items = []
        self._level = 0  # current element nesting depth

    def handle_starttag(self, tag, attrs):
        # collect attributes only for top-level <li> elements
        if self._level == 0 and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
484
485
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    # HTMLAttributeParser deliberately raises compat_HTMLParseError after the
    # first start tag; suppress it and return whatever was collected
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
505
506
def parse_list(webpage):
    """Given a string for a series of top-level HTML <li> elements,
    return a list with each element's attributes as a dict"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
514
515
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Collapse all whitespace, then turn <br> and paragraph boundaries into
    # newlines (order matters: the tag-stripping below would destroy them)
    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
530
531
class LenientJSONDecoder(json.JSONDecoder):
    """JSON decoder that tolerates common breakages in scraped JSON.

    @param transform_source  callable applied to the input string before decoding
    @param ignore_extra      ignore any trailing data after the first JSON value
    @param close_objects     max number of unclosed objects/arrays to auto-close
    """
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # each auto-close may take up to two passes (insert ',', then close)
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Return the truncated document with one object/array closed,
        or None if the error is not recoverable this way."""
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    # Keep `s` intact if the document cannot be repaired, so
                    # the error message below still shows the failing context
                    closed = self._close_object(e)
                    if closed is not None:
                        s = closed
                        continue
                # Clamp the lower slice bound so an error near the start of
                # the document does not wrap around to the end of the string
                raise type(e)(f'{e.msg} in {s[max(e.pos - 10, 0):e.pos + 10]!r}', s, e.pos)
        # Unreachable: every iteration either returns or raises
        raise AssertionError('Too many attempts to decode JSON')
570
571
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed or is unsupported; fall back to a plain open
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # EACCES will not be fixed by renaming; give up immediately
            if attempt or err.errno in (errno.EACCES,):
                raise
            # First failure: retry once with a sanitized path
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
609
610
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
618
619
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            # '\0' marks substitute characters, collapsed and stripped below
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
673
674
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    # NOTE: on win32 `force` is ignored (sanitization always happens);
    # on other platforms the path is returned unchanged unless force is set
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        # Replace characters invalid in Windows path components, plus any
        # trailing space/dot, keeping '.' and '..' components intact
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Preserve the leading separator of absolute POSIX paths
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
696
697
def sanitize_url(url, *, scheme='http'):
    """Prepend `scheme` to protocol-relative URLs and fix common scheme typos."""
    if url is None:
        return None
    if url.startswith('//'):
        # Prepend protocol-less URLs with `http:` scheme in order to mitigate
        # the number of unwanted failures due to missing protocol
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    for mistake, fixup in (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    ):
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
716
717
def extract_basic_auth(url):
    """Strip userinfo from `url`; return (clean_url, 'Basic ...' header or None)."""
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    netloc = parts.hostname if parts.port is None else f'{parts.hostname}:{parts.port}'
    clean_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = f'{parts.username}:{parts.password or ""}'
    token = base64.b64encode(credentials.encode()).decode()
    return clean_url, f'Basic {token}'
728
729
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request from a sanitized/escaped URL, moving any
    userinfo credentials into an 'Authorization: Basic' header."""
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # args[1] is the `headers` positional argument of urllib.request.Request
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
736
737
def expand_path(s):
    """Expand shell variables and ~"""
    # compat_expanduser is used instead of os.path.expanduser for
    # platform-compatibility tweaks — see ..compat
    return os.path.expandvars(compat_expanduser(s))
741
742
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def unique():
        seen = []  # a list, not a set, since the items may be unhashable
        for item in iterable:
            if item not in seen:
                seen.append(item)
                yield item

    return unique() if lazy else list(unique())
753
754
755 def _htmlentity_transform(entity_with_semicolon):
756 """Transforms an HTML entity to a character."""
757 entity = entity_with_semicolon[:-1]
758
759 # Known non-numeric HTML entity
760 if entity in html.entities.name2codepoint:
761 return chr(html.entities.name2codepoint[entity])
762
763 # TODO: HTML5 allows entities without a semicolon.
764 # E.g. '&Eacuteric' should be decoded as 'Éric'.
765 if entity_with_semicolon in html.entities.html5:
766 return html.entities.html5[entity_with_semicolon]
767
768 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
769 if mobj is not None:
770 numstr = mobj.group(1)
771 if numstr.startswith('x'):
772 base = 16
773 numstr = '0%s' % numstr
774 else:
775 base = 10
776 # See https://github.com/ytdl-org/youtube-dl/issues/7518
777 with contextlib.suppress(ValueError):
778 return chr(int(numstr, base))
779
780 # Unknown entity in name, return its literal representation
781 return '&%s;' % entity
782
783
def unescapeHTML(s):
    """Replace HTML entities (e.g. '&amp;') in `s` with their characters."""
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
791
792
def escapeHTML(text):
    """Escape &, <, >, and both quote characters for safe HTML embedding."""
    replacements = (
        ('&', '&amp;'),  # must be first, or it would double-escape
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    for char, escaped in replacements:
        text = text.replace(char, escaped)
    return text
802
803
class netrc_from_content(netrc.netrc):
    """A netrc.netrc that parses its data from a string instead of a file."""

    def __init__(self, content):
        # Deliberately skip netrc.netrc.__init__ (which reads a file);
        # initialize its state and feed the content via the internal parser
        self.hosts = {}
        self.macros = {}
        with io.StringIO(content) as stream:
            self._parse('-', stream, False)
809
810
class Popen(subprocess.Popen):
    # Hide the console window spawned for child processes on Windows
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
             https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            # Not running from a PyInstaller bundle; nothing to restore
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether the streams are in text mode so run() can pick the
        # matching empty default ('' vs b'') for missing output
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        """communicate(), but kill (and reap) the process on any failure."""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            # Reap the killed process; timeout=None waits indefinitely
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run the command to completion; return (stdout, stderr, returncode)."""
        with cls(*args, **kwargs) as proc:
            # `proc.__text_mode` is name-mangled to `_Popen__text_mode`, so
            # this works on instances even inside a classmethod
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
867
868
def encodeArgument(s):
    """Return `s` as str, decoding legacy byte-string arguments as ASCII."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
874
875
# Lightweight record for a duration broken down into clock components
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into (hours, minutes, seconds, milliseconds)."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
884
885
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H<delim>]MM<delim>SS[.mmm] text."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        formatted = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        formatted = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        formatted = '%d' % t.seconds
    return '%s.%03d' % (formatted, t.milliseconds) if msec else formatted
895
896
def make_HTTPS_handler(params, **kwargs):
    """Create a (deprecated) urllib HTTPS handler configured from the
    yt-dlp params dict (certificate options, legacy TLS support, certifi)."""
    from ._deprecated import YoutubeDLHTTPSHandler
    from ..networking._helper import make_ssl_context
    return YoutubeDLHTTPSHandler(params, context=make_ssl_context(
        verify=not params.get('nocheckcertificate'),
        client_certificate=params.get('client_certificate'),
        client_certificate_key=params.get('client_certificate_key'),
        client_certificate_password=params.get('client_certificate_password'),
        legacy_support=params.get('legacyserverconnect'),
        use_certifi='no-certifi' not in params.get('compat_opts', []),
    ), **kwargs)
908
909
def bug_reports_message(before=';'):
    """Return a 'please report this issue' blurb, suitable for appending
    after `before` (capitalized when `before` ends a sentence or is empty)."""
    from ..update import REPOSITORY

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg
921
922
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None  # subclasses may set a default message here

    def __init__(self, msg=None):
        # Precedence: explicit argument > subclass default > class name
        if msg is None and self.msg is None:
            self.msg = type(self).__name__
        elif msg is not None:
            self.msg = msg
        super().__init__(self.msg)
933
934
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        from ..networking.exceptions import network_exceptions
        # Errors raised while handling a network exception are always expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie  # name of the extractor that raised
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            # Chain through to the innermost original exc_info
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: "[ie] video_id: msg (caused by ...)", plus a
        # bug-report blurb for unexpected errors; name-mangled so subclasses
        # cannot accidentally shadow it
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        """Return the formatted traceback(s) of `tb` and `cause`, or None."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Keep msg/args in sync whenever any attribute that feeds __msg
        # (ie, video_id, cause, ...) is changed after construction
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
977
978
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL."""

    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
984
985
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Geo-restriction is an expected condition, not a bug
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        # Country codes from which the video is available, if known
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)
1010
1011
class DownloadError(YoutubeDLError):
    """Download Error exception.

    Raised by FileDownloader objects that are not configured to continue on
    errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
1024
1025
class EntryNotInPlaylist(YoutubeDLError):
    """Raised by YoutubeDL when a requested entry is missing from the playlist info_dict"""
    msg = 'Entry not found in info'
1033
1034
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # BUGFIX: the filename was previously dropped (literal '(unknown)');
            # include it in the message like UnavailableVideoError does with `err`
            self.msg += f': {filename}'
        super().__init__(self.msg)
1047
1048
class PostProcessingError(YoutubeDLError):
    """Raised by a PostProcessor's .run() method to signal a failure in the
    postprocessing task.
    """
1055
1056
class DownloadCancelled(YoutubeDLError):
    """Base for exceptions that interrupt the download queue"""
    msg = 'The download was cancelled'
1060
1061
class ExistingVideoReached(DownloadCancelled):
    """Raised when --break-on-existing is triggered"""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1065
1066
class RejectedVideoReached(DownloadCancelled):
    """Raised when --break-match-filter is triggered"""
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1070
1071
class MaxDownloadsReached(DownloadCancelled):
    """Raised when the --max-downloads limit has been reached"""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1075
1076
class ReExtractInfo(YoutubeDLError):
    """Signals that the video info needs to be extracted again"""

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected
1083
1084
class ThrottledDownload(ReExtractInfo):
    """Raised when the download speed falls below --throttled-rate"""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)
1091
1092
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    Raised when a video is requested in a format that is not available
    for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            # instance-level msg shadows the class attribute
            self.msg += f': {err}'
        super().__init__(self.msg)
1105
1106
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller than
    what the server announced, indicating the connection was probably
    interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        self.downloaded = downloaded  # bytes actually received
        self.expected = expected  # bytes announced by the server
1120
1121
class XAttrMetadataError(YoutubeDLError):
    """Raised on xattr metadata failures; `reason` classifies the cause."""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code, self.msg = code, msg

        # Classify the failure from errno and/or message text so callers
        # don't have to string-match themselves
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1136
1137
class XAttrUnavailableError(YoutubeDLError):
    """Raised when extended attributes cannot be used on this setup"""
1140
1141
1142 def is_path_like(f):
1143 return isinstance(f, (str, bytes, os.PathLike))
1144
1145
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """HTTPCookieProcessor that applies the same cookie handling to HTTPS
    requests/responses as the stock handler applies to HTTP ones."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    # Reuse the plain-HTTP handlers for HTTPS traffic
    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1155
1156
def extract_timezone(date_str):
    """Extract a timezone from a date string.

    Returns (offset as datetime.timedelta, date_str with the timezone part
    removed). The offset defaults to 0 when nothing recognizable is found.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # No 'Z'/numeric offset: try a trailing alphabetic abbreviation after
        # a time (e.g. ' UTC'), resolved through TIMEZONE_NAMES
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            # Known abbreviation: strip it from the string
            date_str = date_str[:-len(m.group('tz'))]
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # Matched the bare 'Z' alternative => UTC
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1185
1186
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # Fractional seconds are not part of the accepted format
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
    except ValueError:
        return None
    return calendar.timegm(dt.timetuple())
1202
1203
def date_formats(day_first=True):
    """Return the strptime patterns to try, in day-first or month-first order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1206
1207
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Commas, AM/PM markers and timezones only get in the way
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    upload_date = None
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return str(upload_date)
1230
1231
def unified_timestamp(date_str, day_first=True):
    """Convert a free-form date string into a UNIX timestamp, or None.

    @param day_first  Whether ambiguous numeric dates are tried day-first
                      (see date_formats)
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas/pipes and (abbreviated) weekday names; collapse whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # Remember a PM marker before it is stripped below
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Last resort: RFC 2822 parsing
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1263
1264
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to `default_ext`."""
    if url is None or '.' not in url:
        return default_ext
    # Everything after the last '.' of the pre-query part
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1276
1277
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build the subtitle filename: <name>.<lang>.<format>"""
    return replace_extension(filename, f'{sub_lang}.{sub_format}', expected_real_ext)
1280
1281
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format strftime format of DATE
    @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
                     auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # Recursively resolve the base date, then apply the signed offset
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # Months/years need calendar-aware arithmetic; result is
            # then treated as day-precise for rounding purposes
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # 'auto': round to the unit that appeared in date_str
            return datetime_round(new_date, unit)
        return new_date

    # Plain DATE in the given strftime format
    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1322
1323
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict Restrict allowed patterns to "YYYYMMDD" and
                  (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    strict_pattern = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
    if strict and not re.fullmatch(strict_pattern, date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1334
1335
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months, clamping the day
    to the last valid day of the resulting month."""
    years_delta, month_index = divmod(dt.month + months - 1, 12)
    year = dt.year + years_delta
    month = month_index + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
1343
1344
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt  # nothing to round away

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    timestamp = calendar.timegm(dt.timetuple())
    rounded = ((timestamp + seconds_per_unit / 2) // seconds_per_unit) * seconds_per_unit
    return datetime.datetime.utcfromtimestamp(rounded)
1361
1362
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
1371
1372
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = (datetime.datetime.min.date() if start is None
                      else date_from_str(start, strict=True))
        self.end = (datetime.datetime.max.date() if end is None
                    else date_from_str(end, strict=True))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1406
1407
@functools.cache
def system_identifier():
    """One-line description of the Python/OS environment, for debug output."""
    impl = platform.python_implementation()
    if impl == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        impl += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        impl,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1426
1427
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    return version_tuple(platform.win32_ver()[1]) if compat_os_name == 'nt' else ()
1435
1436
def write_string(s, out=None, encoding=None):
    """Write the string `s` to `out` (default sys.stderr), encoding as needed."""
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    codec, target = None, out
    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: we must encode ourselves
        codec = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a buffer: write encoded bytes to the buffer
        target = out.buffer
        codec = encoding or getattr(out, 'encoding', None) or preferredencoding()

    target.write(s.encode(codec, 'ignore') if codec else s)
    out.flush()
1456
1457
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Emit a deprecation message: through the CLI printer (once per unique
    message) when running as yt-dlp, otherwise as a DeprecationWarning."""
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return
    if msg in deprecation_warning._cache:
        return  # each unique message is printed only once
    deprecation_warning._cache.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


deprecation_warning._cache = set()
1473
1474
def bytes_to_intlist(bs):
    """Convert a bytes-like (or str) sequence into a list of ints."""
    if not bs:
        return []
    # bytes/bytearray yield ints directly; str needs ord()
    return list(bs) if isinstance(bs[0], int) else [ord(ch) for ch in bs]
1482
1483
def intlist_to_bytes(xs):
    """Pack a sequence of ints (0-255) into a bytes object."""
    if not xs:
        return b''
    return struct.pack(f'{len(xs)}B', *xs)
1488
1489
class LockingUnsupportedError(OSError):
    """Raised when no file-locking primitive is available on this platform"""
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1495
1496
# Cross-platform file locking: defines _lock_file(f, exclusive, block) and
# _unlock_file(f) using Win32 LockFileEx/UnlockFileEx on Windows, fcntl
# flock/lockf elsewhere, or stubs that raise LockingUnsupportedError.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure required by LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: byte range [0, 0x7fffffffffffffff]
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        # Acquire a Win32 lock on the whole file; raises BlockingIOError on failure
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for _unlock_file
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # Flags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        # Release the lock acquired by _lock_file
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            # flock with fallback to lockf where flock is unavailable
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            # Try every unlock variant; the last call propagates any error
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            # No locking primitive available on this platform
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
1583
1584
class locked_file:
    """File wrapper that holds an advisory lock (via _lock_file) while open.

    Usable as a context manager; all unknown attribute access is delegated to
    the underlying file object. Only plain 'r', 'rb', 'a', 'ab', 'w', 'wb'
    modes are supported.
    """
    locked = False  # whether the lock is currently held

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        # Read-only modes take a shared lock; everything else an exclusive one
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only once the lock is held (cf. the O_CREAT note above)
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the object can also be used without `with`
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate read/write/etc. to the wrapped file object
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
1648
1649
@functools.cache
def get_filesystem_encoding():
    """Filesystem encoding reported by Python, defaulting to utf-8."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
1654
1655
def shell_quote(args):
    """Quote each argument for safe shell usage and join them with spaces."""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(a.decode(encoding) if isinstance(a, bytes) else a)
        for a in args)
1665
1666
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, existing_data = unsmuggle_url(url, {})
    data.update(existing_data)
    sdata = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{sdata}'
1675
1676
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: return (clean_url, data) or (url, default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1684
1685
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal sufixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
    suffix = ('', *POSSIBLE_SUFFIXES)[exponent]
    if factor == 1024:
        # Binary prefixes: k -> Ki, M -> Mi, ... (no suffix stays empty)
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    return fmt % (num / factor ** exponent, suffix)
1698
1699
def format_bytes(bytes):
    """Human-readable binary size, e.g. '1.00KiB'; 'N/A' on bad input."""
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
1702
1703
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number> <unit>' using the multipliers in unit_table; None if no match."""
    num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(map(re.escape, unit_table))
    matcher = re.fullmatch if strict else re.match
    m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None

    value = float(m.group('num').replace(',', '.'))
    return round(value * unit_table[m.group('unit')])
1715
1716
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    units = {unit: 1024 ** power for power, unit in enumerate(['', *'KMGTPEZY'])}
    return lookup_unit_table(units, s.upper(), strict=True)
1722
1723
def parse_filesize(s):
    """Parse a human-readable file size (e.g. '5 MiB', '1.5GB') into bytes, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1793
1794
def parse_count(s):
    """Parse a count like '1.2M views' or '1,234' into an int, or None."""
    if s is None:
        return None

    # Drop a leading non-numeric word (e.g. 'Views 1,234' -> '1,234')
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    result = lookup_unit_table({
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }, s)
    if result is not None:
        return result

    # Fall back to the leading number, if any
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))
1822
1823
def parse_resolution(s, *, lenient=False):
    """Extract {'width', 'height'} (or just 'height') from a resolution-like string."""
    if s is None:
        return {}

    # 'WxH' form; the strict variant refuses digits glued to other alphanumerics
    pattern = (r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
               else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    mobj = re.search(pattern, s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}  # 4k -> 2160, 8k -> 4320

    return {}
1847
1848
def parse_bitrate(s):
    """Extract a bitrate in kbps from a string like '128 kbps', or None."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
1855
1856
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    for number, month_name in enumerate(month_names, 1):
        if month_name == name:
            return number
    return None
1866
1867
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1876
1877
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave existing entities and character references untouched
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)', '&amp;', xml_str)
1884
1885
def setproctitle(title):
    """Best-effort: set the process name via libc prctl; silently no-op
    when ctypes, libc or prctl is unavailable."""
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return

    encoded = title.encode()
    name_buffer = ctypes.create_string_buffer(len(encoded))
    name_buffer.value = encoded
    try:
        libc.prctl(15, name_buffer, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1911
1912
def remove_start(s, start):
    """Strip `start` from the beginning of `s` if present; None-safe."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1915
1916
def remove_end(s, end):
    """Strip `end` from the end of `s` if present; None-safe."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
1919
1920
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding `s`."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1928
1929
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    return remove_start(netloc, 'www.') or None
1936
1937
def url_basename(url):
    """Last path component of a URL (query/fragment excluded)."""
    path = urllib.parse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
1941
1942
def base_url(url):
    """URL up to and including the last '/' before any query/fragment."""
    return re.match(r'https?://[^?#]+/', url).group(0)
1945
1946
def urljoin(base, path):
    """Join base and path like urllib.parse.urljoin, but return None on
    unusable input and pass through already-absolute paths."""
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path  # already absolute (scheme- or protocol-relative)
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
1960
1961
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` (or its `get_attr` attribute) to an int scaled by
    invscale/scale; return `default` when conversion is impossible."""
    value = getattr(v, get_attr, None) if get_attr and v is not None else v
    try:
        return int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
1969
1970
def str_or_none(v, default=None):
    """str(v), or `default` when `v` is None."""
    return str(v) if v is not None else default
1973
1974
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Strip thousands separators and '+' before converting
        return int_or_none(re.sub(r'[,\.\+]', '', int_str))
    return None
1982
1983
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to a float scaled by invscale/scale; `default` on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
1991
1992
def bool_or_none(v, default=None):
    """Return `v` only if it is a real bool; otherwise `default`."""
    if isinstance(v, bool):
        return v
    return default
1995
1996
def strip_or_none(v, default=None):
    """Stripped string, or `default` when `v` is not a str."""
    if isinstance(v, str):
        return v.strip()
    return default
1999
2000
def url_or_none(url):
    """Return the stripped URL if it uses a recognized scheme (or is
    protocol-relative); otherwise None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
2006
2007
def request_to_url(req):
    """Full URL of a urllib Request, or `req` itself when it is already a URL."""
    return req.get_full_url() if isinstance(req, urllib.request.Request) else req
2013
2014
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a unix timestamp (int/float) or a 'YYYYMMDD' string with
    `date_format`; return `default` on any failure."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                               + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            datetime_object = None  # triggers AttributeError -> default below
        date_format = re.sub(  # Support %s on windows
            r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2032
2033
def parse_duration(s):
    """Parse a human-readable duration string into seconds (float), or None."""
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None

    # 1) Clock style: [[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2) ISO-8601-ish / verbose: "P1DT2H", "1d 2 hours 3min 4.5s", ...
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3) Bare fractional "X hours" / "Y mins"
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not m:
                return None
            hours, mins = m.groups()

    if ms:
        # Some sites separate milliseconds with ':' instead of '.'
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2088
2089
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* in front of the real extension of *filename*.

    'video.mp4', 'temp' -> 'video.temp.mp4'.  If *expected_real_ext* is given
    and the actual extension differs, *ext* is appended to the whole filename
    instead: 'video.mp4', 'temp', 'webm' -> 'video.mp4.temp'.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        # Bug fix: this branch previously returned the literal '(unknown)'
        # instead of the original filename, discarding the file's name
        else f'{filename}.{ext}')
2096
2097
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace filename's extension with *ext*.

    When *expected_real_ext* is given and does not match the actual extension,
    *ext* is appended to the full filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return f'{name}.{ext}'
2103
2104
def check_executable(exe, args=[]):
    """Check whether *exe* can be spawned; return its name if so, else False.

    *args* should produce a short output quickly (like ['-version']).
    """
    try:
        Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return exe
    except OSError:
        return False
2113
2114
def _get_exe_version_output(exe, args):
    """Run *exe* with *args* and return its combined stdout/stderr output.

    Returns False when the program cannot be started and None when it
    exits with a non-zero status.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else stdout
2127
2128
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's *output* via *version_re*.

    Returns *unrecognized* when nothing matches.
    """
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
2138
2139
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # unrecognized[0] is used when no version can be parsed from the output,
    # unrecognized[-1] when the program ran but exited with an error
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        return unrecognized[-1]
    return output and detect_exe_version(output, version_re, unrecognized[0])
2150
2151
def frange(start=0, stop=None, step=1):
    """Float-friendly analogue of range(); yields start, start+step, ..."""
    if stop is None:
        start, stop = 0, start
    # Sign of step picks the stop condition; step == 0 yields nothing
    direction = (step > 0) - (step < 0)
    value = start
    while direction * value < direction * stop:
        yield value
        value += step
2160
2161
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Raised instead of the builtin so callers can distinguish
        # out-of-range access on a LazyList from other IndexErrors
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache is shared with copies/reversals so each item is computed once
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # ~x == -(x + 1): the same position counted from the opposite end;
        # None (an open slice bound) is passed through unchanged
        return None if x is None else ~x

    def __getitem__(self, idx):
        # Normalize idx into (start, stop, step) over the underlying order
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Otherwise, pull only as many items as this request needs
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Probing the closest end evaluates at most one item
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2249
2250
class PagedList:
    """Base class for lazily fetched, page-based entry lists."""

    class IndexError(IndexError):
        # Distinct subclass so callers can catch paging errors specifically
        pass

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return the (possibly cached) list of results for page *pagenum*."""
        results = self._cache.get(pagenum)
        if results is None:
            results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
2289
2290
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # Walk pages beginning with the one that contains `start`
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested range within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last page that worked so getpage()
                # can short-circuit requests past it
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2330
2331
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # skip_elems: items to drop from the first page;
        # only_more: total items still wanted (None = unbounded)
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
2356
2357
class PlaylistEntries:
    """Resolves playlist entries, honouring the user's item selection
    (playlist_items / playliststart / playlistend parameters)."""

    # Sentinel for a requested entry absent from the extracted list
    MissingEntry = object()
    # True once the underlying entry list is known to be complete
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Scatter the extracted entries into their 1-based requested slots
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches one "START[:END[:STEP]]" item-spec segment, e.g. "1:10:2", "-5:"
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or slice for each comma-separated segment of *string*."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (1-based index, entry) pairs selected by the user's options."""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Total number of playlist entries, when cheaply determinable (else None)."""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Function mapping a 0-based index to an entry; raises self.IndexError
        # when the index is out of range
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Yields (1-based index, entry) pairs for an int or slice of
        # 1-based positions; negative positions count from the end
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2492
2493
def uppercase_escape(s):
    """Decode embedded \\UXXXXXXXX escape sequences in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0], s)
2500
2501
def lowercase_escape(s):
    """Decode embedded \\uXXXX escape sequences in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0], s)
2508
2509
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that must survive unescaped (reserved + already-escaped '%')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe_chars)
2513
2514
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    def quote(component):
        # Same safe-set as escape_rfc3986, inlined here
        return urllib.parse.quote(component, b"%/;:@&=+$,!~*'()?#[]")

    parts = urllib.parse.urlparse(url)
    return parts._replace(
        # Non-ASCII hostnames are punycoded rather than percent-escaped
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=quote(parts.path),
        params=quote(parts.params),
        query=quote(parts.query),
        fragment=quote(parts.fragment),
    ).geturl()
2525
2526
def parse_qs(url, **kwargs):
    """Parse the query string of *url* into a dict of value lists."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2529
2530
def read_batch_urls(batch_fd):
    """Read a batch file object and return its list of cleaned-up URLs.

    Blank lines and lines starting with '#', ';' or ']' are skipped;
    the file object is closed afterwards.
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 BOM, whether raw or already decoded
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2548
2549
def urlencode_postdata(*args, **kargs):
    """urlencode the given data and return it as ASCII bytes for a POST body."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2552
2553
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url          str or parsed URL tuple
    @param query_update dict merged into the existing query parameters
    @returns str
    """
    if isinstance(url, str):
        if not kwargs and not query_update:
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
2572
2573
def update_url_query(url, query):
    """Add (or replace) the given *query* parameters in *url*."""
    return update_url(url, query_update=query)
2576
2577
2578 def _multipart_encode_impl(data, boundary):
2579 content_type = 'multipart/form-data; boundary=%s' % boundary
2580
2581 out = b''
2582 for k, v in data.items():
2583 out += b'--' + boundary.encode('ascii') + b'\r\n'
2584 if isinstance(k, str):
2585 k = k.encode()
2586 if isinstance(v, str):
2587 v = v.encode()
2588 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2589 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2590 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2591 if boundary.encode('ascii') in content:
2592 raise ValueError('Boundary overlaps with data')
2593 out += content
2594
2595 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2596
2597 return out, content_type
2598
2599
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    fixed_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            # A random boundary collided with the payload: retry with a new one
            if fixed_boundary:
                raise
            boundary = None
2628
2629
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """True if *x* is an *allowed_types* iterable, excluding str/bytes/mappings by default."""
    blocked = (str, bytes, collections.abc.Mapping) if blocked_types is NO_DEFAULT else blocked_types
    return isinstance(x, allowed_types) and not isinstance(x, blocked)
2634
2635
def variadic(x, allowed_types=NO_DEFAULT):
    """Wrap *x* in a 1-tuple unless it is already iterable-like."""
    if not isinstance(allowed_types, (tuple, type)):
        # Passing an iterable of types is deprecated
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    return x if is_iterable_like(x, blocked_types=allowed_types) else (x,)
2641
2642
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn and return the first usable result.

    Common data-access exceptions are swallowed; results not matching
    *expected_type* (when given) are discarded.  Returns None if nothing fits.
    """
    for fn in funcs:
        try:
            result = fn(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
2652
2653
def try_get(src, getter, expected_type=None):
    # Apply getter(s) to src, returning the first result that does not raise
    # and (when given) matches expected_type
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2656
2657
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict keeping only items for which cndn(key, value) is true."""
    return {key: value for key, value in dct.items() if cndn(key, value)}
2660
2661
def merge_dicts(*dicts):
    """Merge dicts left-to-right; earlier values win, except that a stored
    empty string may be replaced by a later string value."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            take = (value is not None and key not in merged
                    or isinstance(value, str) and merged[key] == '')
            if take:
                merged[key] = value
    return merged
2670
2671
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as str, decoding bytes-like input with *encoding*."""
    if isinstance(string, str):
        return string
    return str(string, encoding, errors)
2674
2675
# US movie ratings mapped to age limits, as consumed by parse_age_limit()
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines mapped to age limits, as consumed by parse_age_limit()
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2693
2694
def parse_age_limit(s):
    """Parse an age limit (int, 'NN+', a US rating or TV guideline) into an int age."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if m:
        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
    return None
2711
2712
def strip_jsonp(code):
    """Strip a JSONP wrapper ("callback({...});") and return the JSON payload."""
    jsonp_re = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_re.sub(r'\g<callback_data>', code)
2721
2722
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript object/value literal into valid JSON text.

    vars is a dict of var, val pairs to substitute.  When strict is true,
    unknown identifiers raise ValueError instead of being quoted as strings.
    """
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Keep escapes JSON understands; rewrite \xNN to \u00NN; drop escaped
        # newlines (JS line continuations); unescape everything else
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Recursively evaluate a template-literal ${...} expression
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            # Expand ${...} in backtick strings, then re-quote with "
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Integers used as object keys must become JSON strings
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # new Map([[k, v], ...]) -> plain JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # Best-effort reductions of common JS constructor/IIFE wrappers
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
2801
2802
def qualities(quality_ids):
    """Return a ranking function mapping a quality id to its position in
    *quality_ids* (-1 for unknown ids)."""
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2811
2812
# Stages at which postprocessors can be scheduled to run, in pipeline order
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default --output templates for regular downloads and for split chapters
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Recognized --output TYPES; values look like per-type filename infixes
# (None where not applicable) — NOTE(review): inferred from the values,
# confirm against the output-template handling in YoutubeDL
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template: {0} is substituted with the key pattern, {1} with the
# allowed conversion-type pattern
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
    '''


# Conversion types understood by %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
2852
2853
def limit_length(s, length):
    """Truncate *s* to at most *length* characters, ending with an ellipsis."""
    if s is None:
        return None
    ELLIPSES = '...'
    return s if len(s) <= length else s[:length - len(ELLIPSES)] + ELLIPSES
2862
2863
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2866
2867
def is_outdated_version(version, limit, assume_new=True):
    """True if *version* is strictly older than *limit*.

    Missing/unparsable versions yield ``not assume_new``.
    """
    def as_tuple(v):
        # Inlined version_tuple(): dotted/dashed version string -> int tuple
        return tuple(int(e) for e in re.split(r'[-.]', v))

    try:
        if version:
            return as_tuple(version) < as_tuple(limit)
    except ValueError:
        pass
    return not assume_new
2875
2876
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # NOTE(review): imported lazily, presumably to avoid an import cycle
    # with ..update — confirm before moving to module level
    from ..update import is_non_updateable

    return not is_non_updateable()
2883
2884
def args_to_str(args):
    """Return a short shell-quoted string representation of a subprocess command."""
    return ' '.join(map(compat_shlex_quote, args))
2888
2889
def error_to_str(err):
    """Format an exception as 'TypeName: message'."""
    return ': '.join((type(err).__name__, str(err)))
2892
2893
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (possibly with parameters) to a file extension.

    Falls back to *default* when given, else to the bare subtype
    (with '+' replaced by '.'); non-str input yields *default*/None.
    """
    if not isinstance(mt, str):
        if default is not NO_DEFAULT:
            return default
        return None

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop any parameters (e.g. "; charset=...") and normalize case
    mimetype = mt.partition(';')[0].strip().lower()
    _, _, subtype = mimetype.rpartition('/')

    # Try the full type, then the subtype, then the suffix after '+'
    # (e.g. the 'xml' of 'svg+xml')
    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    elif default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
2982
2983
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension or URL; None if unknown."""
    if not ext_or_url:
        return None
    if '.' not in ext_or_url:
        # Bare extension: give mimetypes a dummy filename to inspect
        ext_or_url = f'file.{ext_or_url}'
    mime, _ = mimetypes.guess_type(ext_or_url)
    return mime
2990
2991
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string into vcodec/acodec/scodec/dynamic_range."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    # hdr holds the dynamic-range tag (DV / HDR10) derived from the video codec
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Strip leading zeros from each dotted part before comparing
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                continue
            vcodec = full_codec
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Two unrecognized codecs: assume they are in video,audio order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3032
3033
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension able to hold the given video/audio codecs,
    honouring *preferences*; falls back to 'mkv' (when allowed) or the last
    preference."""
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    # Multiple video or audio streams: only mkv is known to hold them
    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Normalize each codec to its lowercased first dotted part without zeros
    sanitize_codec = functools.partial(
        try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    # Codec-based matching failed: fall back to extension-family compatibility
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3073
3074
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Infer a file extension from response headers: Content-Disposition
    filename, then x-amz-meta-name, then Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    meta_name = getheader('x-amz-meta-name')
    if meta_name:
        ext = meta_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(getheader('Content-Type'), default=default)
3093
3094
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 base64 data: URI for *data* with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3097
3098
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # Either no limit was requested or the content carries no restriction
        return False
    return age_limit < content_limit
3107
3108
# List of known byte-order-marks (BOM)
# Each entry maps a BOM byte sequence to the encoding it signals. The UTF-32
# entries must stay listed before the UTF-16 ones: b'\xff\xfe' (UTF-16-LE) is
# a prefix of b'\xff\xfe\x00\x00' (UTF-32-LE), so consumers iterating this
# list in order (e.g. is_html) see the longer BOM first.
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3117
3118
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    detected_encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip (possibly repeated) BOMs; the last one stripped decides the encoding
        while first_bytes.startswith(bom):
            detected_encoding = bom_encoding
            first_bytes = first_bytes[len(bom):]

    return re.match(r'^\s*<', first_bytes.decode(detected_encoding, 'replace'))
3128
3129
def determine_protocol(info_dict):
    """Return the download protocol for *info_dict*, deducing it from the URL
    when no explicit 'protocol' field is present."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    for proto in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(proto):
            return proto

    ext = determine_ext(url)
    if ext == 'm3u8':
        # Live HLS must use the native-incompatible 'm3u8' downloader
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3150
3151
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned

    @param delim        Character (or False) used to draw a separator row under the header
    @param extra_gap    Extra spaces added between columns
    @param hide_empty   Hide columns whose data cells are all empty
    """
    def width(string):
        # Display width: terminal escape sequences and tabs occupy no columns
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only the cells whose corresponding filter entry is truthy;
        # cells beyond the end of filterArray are always kept
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # With hide_empty, columns whose maximum data width is 0 are dropped
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Replace the tab with padding so the trailing part is right-aligned
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3182
3183
3184 def _match_one(filter_part, dct, incomplete):
3185 # TODO: Generalize code with YoutubeDL._build_format_filter
3186 STRING_OPERATORS = {
3187 '*=': operator.contains,
3188 '^=': lambda attr, value: attr.startswith(value),
3189 '$=': lambda attr, value: attr.endswith(value),
3190 '~=': lambda attr, value: re.search(value, attr),
3191 }
3192 COMPARISON_OPERATORS = {
3193 **STRING_OPERATORS,
3194 '<=': operator.le, # "<=" must be defined above "<"
3195 '<': operator.lt,
3196 '>=': operator.ge,
3197 '>': operator.gt,
3198 '=': operator.eq,
3199 }
3200
3201 if isinstance(incomplete, bool):
3202 is_incomplete = lambda _: incomplete
3203 else:
3204 is_incomplete = lambda k: k in incomplete
3205
3206 operator_rex = re.compile(r'''(?x)
3207 (?P<key>[a-z_]+)
3208 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3209 (?:
3210 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3211 (?P<strval>.+?)
3212 )
3213 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3214 m = operator_rex.fullmatch(filter_part.strip())
3215 if m:
3216 m = m.groupdict()
3217 unnegated_op = COMPARISON_OPERATORS[m['op']]
3218 if m['negation']:
3219 op = lambda attr, value: not unnegated_op(attr, value)
3220 else:
3221 op = unnegated_op
3222 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3223 if m['quote']:
3224 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3225 actual_value = dct.get(m['key'])
3226 numeric_comparison = None
3227 if isinstance(actual_value, (int, float)):
3228 # If the original field is a string and matching comparisonvalue is
3229 # a number we should respect the origin of the original field
3230 # and process comparison value as a string (see
3231 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3232 try:
3233 numeric_comparison = int(comparison_value)
3234 except ValueError:
3235 numeric_comparison = parse_filesize(comparison_value)
3236 if numeric_comparison is None:
3237 numeric_comparison = parse_filesize(f'{comparison_value}B')
3238 if numeric_comparison is None:
3239 numeric_comparison = parse_duration(comparison_value)
3240 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3241 raise ValueError('Operator %s only supports string values!' % m['op'])
3242 if actual_value is None:
3243 return is_incomplete(m['key']) or m['none_inclusive']
3244 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3245
3246 UNARY_OPERATORS = {
3247 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3248 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3249 }
3250 operator_rex = re.compile(r'''(?x)
3251 (?P<op>%s)\s*(?P<key>[a-z_]+)
3252 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3253 m = operator_rex.fullmatch(filter_part.strip())
3254 if m:
3255 op = UNARY_OPERATORS[m.group('op')]
3256 actual_value = dct.get(m.group('key'))
3257 if is_incomplete(m.group('key')) and actual_value is None:
3258 return True
3259 return op(actual_value)
3260
3261 raise ValueError('Invalid filter part %r' % filter_part)
3262
3263
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
    Can be True/False to indicate all/none of the keys may be missing.
    All conditions on incomplete keys pass if the key is missing
    """
    # Conditions are joined with '&'; a literal '&' can be escaped as '\&'
    conditions = re.split(r'(?<!\\)&', filter_str)
    return all(
        _match_one(condition.replace(r'\&', '&'), dct, incomplete)
        for condition in conditions)
3274
3275
def match_filter_func(filters, breaking_filters=None):
    """Build a match-filter callable from filter strings.

    The returned function takes (info_dict, incomplete) and returns None when
    the video passes, NO_DEFAULT to request interactive confirmation, or a
    skip-reason string otherwise. Returns None when no filters are given.
    """
    if not filters and not breaking_filters:
        return None

    # Failing a breaking filter aborts via RejectedVideoReached instead of skipping
    breaking_check = match_filter_func(breaking_filters) or (lambda _, __: None)

    filter_set = set(variadic(filters or []))
    # A lone '-' requests interactive confirmation for passing videos
    interactive = '-' in filter_set
    filter_set.discard('-')

    def _match_func(info_dict, incomplete=False):
        breaking_reason = breaking_check(info_dict, incomplete)
        if breaking_reason is not None:
            raise RejectedVideoReached(breaking_reason)

        if not filter_set or any(match_str(f, info_dict, incomplete) for f in filter_set):
            return NO_DEFAULT if interactive and not incomplete else None
        video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
        filter_str = ') | ('.join(map(str.strip, filter_set))
        return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3298
3299
class download_range_func:
    """Produce the sections (matched chapters and/or time ranges) of a video
    to download. Instances are callable with (info_dict, ydl) and yield one
    dict per section."""

    def __init__(self, chapters, ranges, from_info=False):
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        # Yield every chapter whose title matches any of the requested regexes
        for regex in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': index}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # Nothing was requested: a single empty section means "whole video"
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # Negative timestamps count backwards from the end of the video
        if info.get('duration') and time < 0:
            return max(info['duration'] + time, 0)
        return time

    def __eq__(self, other):
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges)

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3340
3341
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (None when unparsable)."""
    if not time_expr:
        return None

    # Plain offset, optionally suffixed with 's' (e.g. '12.5s')
    offset = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    # Clock time HH:MM:SS[.fff] (some files use ':' for the fraction separator)
    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3353
3354
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours, minutes, secs, msecs = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, msecs)
3357
3358
def ass_subtitles_timecode(seconds):
    """Format a duration in seconds as an ASS timecode (H:MM:SS.cc)."""
    timetuple = timetuple_from_msec(seconds * 1000)
    # ASS uses centiseconds, so scale down the milliseconds field
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], timetuple.milliseconds / 10)
3362
3363
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Map legacy TTML namespaces onto the ones used below
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # TTML styling attributes that are translated into SRT markup
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    # Helper to expand 'prefix:tag' into a fully namespaced XPath/tag name
    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}          # style id -> resolved style properties
    default_style = {}   # style inherited from the body/div elements

    # XMLParser target that converts one <p> element into SRT-flavoured markup
    class TTMLPElementParser:
        # NOTE(review): class-level mutable attributes are shared by all
        # instances of this (per-call) class; pushes and pops below stay
        # balanced, so state does not leak between paragraphs
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Effective style: default, then referenced style, then inline attributes
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from the enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened by the matching start() call
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Re-serialize the node and feed it through TTMLPElementParser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Normalize legacy namespaces before parsing
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style definitions; repeat while parent styles are still unresolved
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    # Parent not seen yet: retry on the next pass
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style set on body/div becomes the default for all paragraphs
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Derive the end from the duration when no explicit end is given
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3530
3531
def cli_option(params, command_option, param, separator=None):
    """Render a value from *params* as command-line arguments.

    Returns [] when the value is unset; otherwise either
    [command_option, value] or ['command_option<separator>value'].
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3537
3538
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean value from *params* as command-line arguments,
    mapping True/False to *true_value*/*false_value* (unset -> [])."""
    value = params.get(param)
    assert value in (True, False, None)
    if value is None:
        return []
    rendered = true_value if value else false_value
    if separator is None:
        return [command_option, str(rendered)]
    return [f'{command_option}{separator}{rendered}']
3543
3544
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals *expected_value*, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3547
3548
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Select the argument list from *argdict* for the first matching key group.

    Each entry of *keys* may be a single key or a tuple of keys; the first
    group with at least one hit wins, and all its hits are concatenated.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_group in keys:
        hits = [argdict.get(key.lower()) for key in variadic(key_group)]
        hits = [args for args in hits if args is not None]
        if hits:
            # Each hit is itself a list of arguments - flatten them
            return [arg for args in hits for arg in args]
    return default
3567
3568
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve configuration args for a (main_key, exe) pair via cli_configuration_args."""
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    lookup_keys = [f'{root_key}{key}' for key in (keys or [''])]
    if root_key in lookup_keys:  # i.e. an empty suffix was requested
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    else:
        # Only the un-suffixed root key may fall back to compat list handling
        use_compat = False
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3580
3581
class ISO639Utils:
    """Conversions between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the primary subtag matters, so e.g. 'en-US' maps like 'en'
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup; None when the code is unknown
        return next(
            (short for short, long_code in cls._lang_map.items() if long_code == code),
            None)
3786
3787
class ISO3166Utils:
    """Lookup of full country names from ISO 3166-1 alpha-2 codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Keys are stored upper-case; normalize the input before lookup
        normalized = code.upper()
        return cls._country_map.get(normalized)
4049
4050
4051 class GeoUtils:
4052 # Major IPv4 address blocks per country
4053 _country_ip_map = {
4054 'AD': '46.172.224.0/19',
4055 'AE': '94.200.0.0/13',
4056 'AF': '149.54.0.0/17',
4057 'AG': '209.59.64.0/18',
4058 'AI': '204.14.248.0/21',
4059 'AL': '46.99.0.0/16',
4060 'AM': '46.70.0.0/15',
4061 'AO': '105.168.0.0/13',
4062 'AP': '182.50.184.0/21',
4063 'AQ': '23.154.160.0/24',
4064 'AR': '181.0.0.0/12',
4065 'AS': '202.70.112.0/20',
4066 'AT': '77.116.0.0/14',
4067 'AU': '1.128.0.0/11',
4068 'AW': '181.41.0.0/18',
4069 'AX': '185.217.4.0/22',
4070 'AZ': '5.197.0.0/16',
4071 'BA': '31.176.128.0/17',
4072 'BB': '65.48.128.0/17',
4073 'BD': '114.130.0.0/16',
4074 'BE': '57.0.0.0/8',
4075 'BF': '102.178.0.0/15',
4076 'BG': '95.42.0.0/15',
4077 'BH': '37.131.0.0/17',
4078 'BI': '154.117.192.0/18',
4079 'BJ': '137.255.0.0/16',
4080 'BL': '185.212.72.0/23',
4081 'BM': '196.12.64.0/18',
4082 'BN': '156.31.0.0/16',
4083 'BO': '161.56.0.0/16',
4084 'BQ': '161.0.80.0/20',
4085 'BR': '191.128.0.0/12',
4086 'BS': '24.51.64.0/18',
4087 'BT': '119.2.96.0/19',
4088 'BW': '168.167.0.0/16',
4089 'BY': '178.120.0.0/13',
4090 'BZ': '179.42.192.0/18',
4091 'CA': '99.224.0.0/11',
4092 'CD': '41.243.0.0/16',
4093 'CF': '197.242.176.0/21',
4094 'CG': '160.113.0.0/16',
4095 'CH': '85.0.0.0/13',
4096 'CI': '102.136.0.0/14',
4097 'CK': '202.65.32.0/19',
4098 'CL': '152.172.0.0/14',
4099 'CM': '102.244.0.0/14',
4100 'CN': '36.128.0.0/10',
4101 'CO': '181.240.0.0/12',
4102 'CR': '201.192.0.0/12',
4103 'CU': '152.206.0.0/15',
4104 'CV': '165.90.96.0/19',
4105 'CW': '190.88.128.0/17',
4106 'CY': '31.153.0.0/16',
4107 'CZ': '88.100.0.0/14',
4108 'DE': '53.0.0.0/8',
4109 'DJ': '197.241.0.0/17',
4110 'DK': '87.48.0.0/12',
4111 'DM': '192.243.48.0/20',
4112 'DO': '152.166.0.0/15',
4113 'DZ': '41.96.0.0/12',
4114 'EC': '186.68.0.0/15',
4115 'EE': '90.190.0.0/15',
4116 'EG': '156.160.0.0/11',
4117 'ER': '196.200.96.0/20',
4118 'ES': '88.0.0.0/11',
4119 'ET': '196.188.0.0/14',
4120 'EU': '2.16.0.0/13',
4121 'FI': '91.152.0.0/13',
4122 'FJ': '144.120.0.0/16',
4123 'FK': '80.73.208.0/21',
4124 'FM': '119.252.112.0/20',
4125 'FO': '88.85.32.0/19',
4126 'FR': '90.0.0.0/9',
4127 'GA': '41.158.0.0/15',
4128 'GB': '25.0.0.0/8',
4129 'GD': '74.122.88.0/21',
4130 'GE': '31.146.0.0/16',
4131 'GF': '161.22.64.0/18',
4132 'GG': '62.68.160.0/19',
4133 'GH': '154.160.0.0/12',
4134 'GI': '95.164.0.0/16',
4135 'GL': '88.83.0.0/19',
4136 'GM': '160.182.0.0/15',
4137 'GN': '197.149.192.0/18',
4138 'GP': '104.250.0.0/19',
4139 'GQ': '105.235.224.0/20',
4140 'GR': '94.64.0.0/13',
4141 'GT': '168.234.0.0/16',
4142 'GU': '168.123.0.0/16',
4143 'GW': '197.214.80.0/20',
4144 'GY': '181.41.64.0/18',
4145 'HK': '113.252.0.0/14',
4146 'HN': '181.210.0.0/16',
4147 'HR': '93.136.0.0/13',
4148 'HT': '148.102.128.0/17',
4149 'HU': '84.0.0.0/14',
4150 'ID': '39.192.0.0/10',
4151 'IE': '87.32.0.0/12',
4152 'IL': '79.176.0.0/13',
4153 'IM': '5.62.80.0/20',
4154 'IN': '117.192.0.0/10',
4155 'IO': '203.83.48.0/21',
4156 'IQ': '37.236.0.0/14',
4157 'IR': '2.176.0.0/12',
4158 'IS': '82.221.0.0/16',
4159 'IT': '79.0.0.0/10',
4160 'JE': '87.244.64.0/18',
4161 'JM': '72.27.0.0/17',
4162 'JO': '176.29.0.0/16',
4163 'JP': '133.0.0.0/8',
4164 'KE': '105.48.0.0/12',
4165 'KG': '158.181.128.0/17',
4166 'KH': '36.37.128.0/17',
4167 'KI': '103.25.140.0/22',
4168 'KM': '197.255.224.0/20',
4169 'KN': '198.167.192.0/19',
4170 'KP': '175.45.176.0/22',
4171 'KR': '175.192.0.0/10',
4172 'KW': '37.36.0.0/14',
4173 'KY': '64.96.0.0/15',
4174 'KZ': '2.72.0.0/13',
4175 'LA': '115.84.64.0/18',
4176 'LB': '178.135.0.0/16',
4177 'LC': '24.92.144.0/20',
4178 'LI': '82.117.0.0/19',
4179 'LK': '112.134.0.0/15',
4180 'LR': '102.183.0.0/16',
4181 'LS': '129.232.0.0/17',
4182 'LT': '78.56.0.0/13',
4183 'LU': '188.42.0.0/16',
4184 'LV': '46.109.0.0/16',
4185 'LY': '41.252.0.0/14',
4186 'MA': '105.128.0.0/11',
4187 'MC': '88.209.64.0/18',
4188 'MD': '37.246.0.0/16',
4189 'ME': '178.175.0.0/17',
4190 'MF': '74.112.232.0/21',
4191 'MG': '154.126.0.0/17',
4192 'MH': '117.103.88.0/21',
4193 'MK': '77.28.0.0/15',
4194 'ML': '154.118.128.0/18',
4195 'MM': '37.111.0.0/17',
4196 'MN': '49.0.128.0/17',
4197 'MO': '60.246.0.0/16',
4198 'MP': '202.88.64.0/20',
4199 'MQ': '109.203.224.0/19',
4200 'MR': '41.188.64.0/18',
4201 'MS': '208.90.112.0/22',
4202 'MT': '46.11.0.0/16',
4203 'MU': '105.16.0.0/12',
4204 'MV': '27.114.128.0/18',
4205 'MW': '102.70.0.0/15',
4206 'MX': '187.192.0.0/11',
4207 'MY': '175.136.0.0/13',
4208 'MZ': '197.218.0.0/15',
4209 'NA': '41.182.0.0/16',
4210 'NC': '101.101.0.0/18',
4211 'NE': '197.214.0.0/18',
4212 'NF': '203.17.240.0/22',
4213 'NG': '105.112.0.0/12',
4214 'NI': '186.76.0.0/15',
4215 'NL': '145.96.0.0/11',
4216 'NO': '84.208.0.0/13',
4217 'NP': '36.252.0.0/15',
4218 'NR': '203.98.224.0/19',
4219 'NU': '49.156.48.0/22',
4220 'NZ': '49.224.0.0/14',
4221 'OM': '5.36.0.0/15',
4222 'PA': '186.72.0.0/15',
4223 'PE': '186.160.0.0/14',
4224 'PF': '123.50.64.0/18',
4225 'PG': '124.240.192.0/19',
4226 'PH': '49.144.0.0/13',
4227 'PK': '39.32.0.0/11',
4228 'PL': '83.0.0.0/11',
4229 'PM': '70.36.0.0/20',
4230 'PR': '66.50.0.0/16',
4231 'PS': '188.161.0.0/16',
4232 'PT': '85.240.0.0/13',
4233 'PW': '202.124.224.0/20',
4234 'PY': '181.120.0.0/14',
4235 'QA': '37.210.0.0/15',
4236 'RE': '102.35.0.0/16',
4237 'RO': '79.112.0.0/13',
4238 'RS': '93.86.0.0/15',
4239 'RU': '5.136.0.0/13',
4240 'RW': '41.186.0.0/16',
4241 'SA': '188.48.0.0/13',
4242 'SB': '202.1.160.0/19',
4243 'SC': '154.192.0.0/11',
4244 'SD': '102.120.0.0/13',
4245 'SE': '78.64.0.0/12',
4246 'SG': '8.128.0.0/10',
4247 'SI': '188.196.0.0/14',
4248 'SK': '78.98.0.0/15',
4249 'SL': '102.143.0.0/17',
4250 'SM': '89.186.32.0/19',
4251 'SN': '41.82.0.0/15',
4252 'SO': '154.115.192.0/18',
4253 'SR': '186.179.128.0/17',
4254 'SS': '105.235.208.0/21',
4255 'ST': '197.159.160.0/19',
4256 'SV': '168.243.0.0/16',
4257 'SX': '190.102.0.0/20',
4258 'SY': '5.0.0.0/16',
4259 'SZ': '41.84.224.0/19',
4260 'TC': '65.255.48.0/20',
4261 'TD': '154.68.128.0/19',
4262 'TG': '196.168.0.0/14',
4263 'TH': '171.96.0.0/13',
4264 'TJ': '85.9.128.0/18',
4265 'TK': '27.96.24.0/21',
4266 'TL': '180.189.160.0/20',
4267 'TM': '95.85.96.0/19',
4268 'TN': '197.0.0.0/11',
4269 'TO': '175.176.144.0/21',
4270 'TR': '78.160.0.0/11',
4271 'TT': '186.44.0.0/15',
4272 'TV': '202.2.96.0/19',
4273 'TW': '120.96.0.0/11',
4274 'TZ': '156.156.0.0/14',
4275 'UA': '37.52.0.0/14',
4276 'UG': '102.80.0.0/13',
4277 'US': '6.0.0.0/8',
4278 'UY': '167.56.0.0/13',
4279 'UZ': '84.54.64.0/18',
4280 'VA': '212.77.0.0/19',
4281 'VC': '207.191.240.0/21',
4282 'VE': '186.88.0.0/13',
4283 'VG': '66.81.192.0/20',
4284 'VI': '146.226.0.0/16',
4285 'VN': '14.160.0.0/11',
4286 'VU': '202.80.32.0/20',
4287 'WF': '117.20.32.0/21',
4288 'WS': '202.4.32.0/19',
4289 'YE': '134.35.0.0/16',
4290 'YT': '41.242.116.0/22',
4291 'ZA': '41.0.0.0/11',
4292 'ZM': '102.144.0.0/13',
4293 'ZW': '102.177.192.0/18',
4294 }
4295
4296 @classmethod
4297 def random_ipv4(cls, code_or_block):
4298 if len(code_or_block) == 2:
4299 block = cls._country_ip_map.get(code_or_block.upper())
4300 if not block:
4301 return None
4302 else:
4303 block = code_or_block
4304 addr, preflen = block.split('/')
4305 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4306 addr_max = addr_min | (0xffffffff >> int(preflen))
4307 return str(socket.inet_ntoa(
4308 struct.pack('!L', random.randint(addr_min, addr_max))))
4309
4310
4311 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4312 # released into Public Domain
4313 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4314
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    # int.to_bytes yields the minimal big-endian representation directly,
    # replacing the manual 32-bit pack loop and leading-zero stripping pass.
    # Non-positive n maps to b'\x00', matching the original's fallback branch.
    s = n.to_bytes((n.bit_length() + 7) // 8, 'big') if n > 0 else b'\000'
    # Front-pad with zeros up to a multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4343
4344
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes handles any length (including empty -> 0) in one call,
    # replacing the manual zero-padding and 4-byte struct.unpack loop
    return int.from_bytes(s, 'big')
4360
4361
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # Hexlifying the reversed bytes and parsing as base 16 is exactly
    # little-endian integer decoding; int.from_bytes also accepts empty
    # input (-> 0) instead of raising like int('', 16) did
    payload = int.from_bytes(data, 'little')
    return '%x' % pow(payload, exponent, modulus)
4377
4378
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    # The scheme requires at least 8 random padding bytes plus 3 marker bytes
    pad_len = length - len(data) - 3
    if pad_len < 8:
        raise ValueError('Input data too long for PKCS#1 padding')

    padding = [random.randint(0, 254) for _ in range(pad_len)]
    return [0, 2, *padding, 0] + data
4392
4393
4394 def _base_n_table(n, table):
4395 if not table and not n:
4396 raise ValueError('Either table or n must be specified')
4397 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4398
4399 if n and n != len(table):
4400 raise ValueError(f'base {n} exceeds table length {len(table)}')
4401 return table
4402
4403
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4415
4416
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    digit_values = {char: idx for idx, char in enumerate(_base_n_table(n, table))}
    base = len(digit_values)
    value = 0
    for char in string:
        value = value * base + digit_values[char]
    return value
4424
4425
def decode_packed_codes(code):
    """Rebuild the symbol table of packed code and substitute the symbols back in."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in range(count):
        key = encode_base_n(index, base)
        # An empty symbol means the identifier stands for itself
        symbol_table[key] = symbols[index] or key

    return re.sub(
        r'\b(\w+)\b', lambda match: symbol_table[match.group(0)],
        obfuscated_code)
4442
4443
def caesar(s, alphabet, shift):
    """Shift every character of *s* that occurs in *alphabet* by *shift* positions (wrapping)."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        pos = alphabet.find(char)
        # Characters outside the alphabet pass through unchanged
        return char if pos < 0 else alphabet[(pos + shift) % size]

    return ''.join(map(rotate, s))
4451
4452
def rot47(s):
    """Apply the ROT47 cipher: rotate the printable ASCII range 33..126 by 47 places."""
    # Equivalent to a caesar shift of 47 over the 94-character printable alphabet
    return ''.join(
        chr(33 + (ord(c) - 33 + 47) % 94) if 33 <= ord(c) <= 126 else c
        for c in s)
4455
4456
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (KEY=value,KEY2="quoted,value") into a dict."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Quoted values may contain commas; strip the surrounding quotes
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
4464
4465
def urshift(val, n):
    """Unsigned (logical) right shift of a 32-bit value, like JavaScript's `>>>`."""
    if val >= 0:
        return val >> n
    # Reinterpret a negative 32-bit int as its unsigned equivalent first
    return (val + 0x100000000) >> n
4468
4469
def write_xattr(path, key, value):
    """Write the extended attribute *key* = *value* (bytes) on file *path*.

    Tries, in order: NTFS Alternate Data Streams (Windows), the python
    xattr/pyxattr modules, then the setfattr/xattr command-line tools.
    Raises XAttrMetadataError when writing fails and XAttrUnavailableError
    when no method of writing is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The CLI tools expect a text value, not bytes
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4519
4520
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to a random date between 1950 and 1995."""
    first, last = datetime.date(1950, 1, 1), datetime.date(1995, 12, 31)
    chosen = first + datetime.timedelta(days=random.randint(0, (last - first).days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
4531
4532
def find_available_port(interface=''):
    """Bind an ephemeral port on *interface* and return its number, or None on failure."""
    with contextlib.suppress(OSError):
        with socket.socket() as sock:
            sock.bind((interface, 0))  # port 0 lets the OS pick a free port
            return sock.getsockname()[1]
    return None
4540
4541
# Templates for internet shortcut files, which are plain text files.

# Windows/freedesktop ".url" shortcut
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS ".webloc" shortcut (an XML property list)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop ".desktop" link entry
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps the --write-link format name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4573
4574
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    # Rebuild the authority component piece by piece (userinfo, host, port)
    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    # NOTE(review): port 80 is elided regardless of scheme, even though it is
    # only the default for http (not https) — confirm this is intended
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4617
4618
def to_high_limit_path(path):
    """On Windows, prefix *path* with \\\\?\\ to bypass the MAX_PATH limit; elsewhere return it unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # Work around MAX_PATH limitation on Windows. The maximum allowed length
    # for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
4625
4626
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Traverse *field* in *obj* and format the value with *template*, or return *default*."""
    val = traversal.traverse_obj(obj, *variadic(field))
    # Without an explicit ignore list, any falsy value is skipped
    skip = (val in variadic(ignore)) if ignore is not NO_DEFAULT else not val
    if skip:
        return default
    return template % func(val)
4632
4633
def clean_podcast_url(url):
    """Strip known podcast analytics/tracker prefixes from *url*."""
    url = re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/''', '', url)
    # If a tracker prefix was removed, a nested scheme ("https://https://...")
    # may remain; collapse it down to the inner URL
    return re.sub(r'^\w+://(\w+://)', r'\1', url)
4654
4655
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random UUIDv4-shaped string (NB: the variant nibble is fully random)."""
    return ''.join(
        _HEX_TABLE[random.randint(0, 15)] if ch in 'xy' else ch
        for ch in 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4661
4662
def make_dir(path, to_screen=None):
    """Create the parent directory of *path* (like `mkdir -p` on its dirname).

    @param path       file path whose parent directory should exist afterwards
    @param to_screen  optional callable used to report a failure message
    @returns          True on success (or nothing to do), False on OSError
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Fixed: `callable(to_screen) is not None` was always True (callable()
        # returns a bool), so a None to_screen raised TypeError on this path
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
4673
4674
def get_executable_path():
    """Return the directory containing the program's executable (per ..update's detection)."""
    from ..update import _get_variant_and_executable_path

    _, exe_path = _get_variant_and_executable_path()
    return os.path.dirname(os.path.abspath(exe_path))
4679
4680
def get_user_config_dirs(package_name):
    """Yield the per-user configuration directory candidates for *package_name*."""
    # XDG base directory (e.g. ~/.config/package_name)
    yield os.path.join(
        os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config'), package_name)

    # Windows roaming appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # Dotted directory in the home folder (~/.package_name)
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')
4693
4694
def get_system_config_dirs(package_name):
    """Yield system-wide configuration directories (currently only under /etc)."""
    base = '/etc'
    yield os.path.join(base, package_name)
4698
4699
def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
    """
    # kwargs are timedelta arguments (hours=..., days=...) applied as an offset
    offset = datetime.timedelta(**kwargs)
    return time.time() + offset.total_seconds()
4705
4706
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JWS Compact Serialization token signed with HMAC-SHA256.

    @param payload_data  JSON-serializable payload (the JWT claims)
    @param key           shared secret (str) used for the signature
    @param headers       optional extra JOSE header fields (may override alg/typ)
    @returns             the token as bytes: b"<header>.<payload>.<signature>"
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    # Fixed: headers previously defaulted to a mutable {} (shared across calls)
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
4724
4725
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode the payload of a JWS compact token WITHOUT verifying its signature."""
    _, payload_b64, _ = jwt.split('.')
    # Re-add any stripped '=' padding; superfluous '='s are ignored by b64decode
    return json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
4732
4733
# Tri-state flag: None where VT handling is moot (non-Windows), False on
# Windows until windows_enable_vt_mode() succeeds and flips it to True
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
4735
4736
@functools.cache
def supports_terminal_sequences(stream):
    """Whether ANSI terminal escape sequences may be emitted when writing to *stream*."""
    if compat_os_name == 'nt':
        # Requires VT processing to have been enabled via windows_enable_vt_mode()
        vt_available = bool(WINDOWS_VT_MODE)
    else:
        vt_available = bool(os.getenv('TERM'))
    if not vt_available:
        return False
    try:
        return stream.isatty()
    except BaseException:
        # isatty may be missing or raise on exotic stream objects
        return False
4748
4749
def windows_enable_vt_mode():
    """Ref: https://bugs.python.org/issue30075 """
    # Enable ANSI/VT escape sequence processing on the Windows console and
    # flip the global WINDOWS_VT_MODE flag on success
    if get_windows_version() < (10, 0, 10586):
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly, independent of stdout redirection
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Preserve the existing console mode bits; only add VT processing
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # Cached results were computed with VT disabled; force re-evaluation
    supports_terminal_sequences.cache_clear()
4780
4781
# Matches ANSI SGR escape sequences such as "\033[0;31m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip ANSI terminal (SGR) escape sequences from *string*."""
    return re.sub(_terminal_sequences_re, '', string)
4787
4788
def number_of_digits(number):
    """Number of characters in the decimal representation of *number* (sign included)."""
    as_decimal = '%d' % number
    return len(as_decimal)
4791
4792
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy *values* with *delim*; with *from_dict*, values are traversal paths into it."""
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
4797
4798
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    dimension_keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in dimension_keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        # No format declares a width; leave the thumbnails untouched
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled.append(merge_dicts(
            {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
            dict(zip(dimension_keys, max_dimensions)), thumbnail))
    return scaled
4819
4820
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if mobj is None:
        return None, None, None
    start, end, total = mobj.groups()
    # end and total are optional in the header, hence the lenient conversion
    return int(start), int_or_none(end), int_or_none(total)
4829
4830
def read_stdin(what):
    """Announce that *what* will be read from STDIN and return the stdin stream."""
    eof_key = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof_key}) to end:\n')
    return sys.stdin
4835
4836
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """

    # BOM marks are given priority over declarations
    for bom, enc in BOMS:
        if data.startswith(bom):
            return enc, len(bom)

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if mobj:
        return mobj.group(1).decode(), 0
    return None, 0
4853
4854
class Config:
    """One source of program arguments (CLI, a config file, or stdin).

    Configs referenced via --config-locations are loaded recursively as child
    Config objects; `all_args` flattens them with the own args last so they
    take precedence.
    """
    own_args = None     # raw args this config was created with
    parsed_args = None  # set to own_args once load_configs() has run
    filename = None     # path of the backing config file, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Attach args/filename and load referenced configs (may only run once)."""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own args and append configs named in --config-locations.

        @returns False if this config file was already loaded (cycle guard), else True
        """
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # '-' means: read additional options from stdin (only once)
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against this config's directory
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read and shlex-split a config file; return *default* if it is missing."""
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # Fixed: the message previously contained a literal "(unknown)"
            # placeholder instead of the offending filename
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return *opts* with the values of credential options replaced by PRIVATE."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        # Scrub "--password=..." style first, then the separate-argument style
        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child Config (sharing the loaded-path set) and append it if it loads."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """All args, most recently appended configs first, own args last (highest precedence)."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
4962
4963
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes"""
    pool = None  # the open connection, set lazily by __enter__

    def __init__(self, url, headers=None, connect=True):
        # A dedicated event loop drives all websocket coroutines synchronously
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Ensure the connection/loop are torn down at interpreter exit
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        # Blocking send: runs the coroutine to completion on the private loop
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        # Blocking receive
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # NOTE(review): the loop is closed *before* _cancel_all_tasks runs
            # against it — looks inverted; confirm whether this order is intended
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run coroutine *main* to completion on *loop*, shutting down generators/executors after."""
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        # Cancel pending tasks and surface any unexpected exceptions they held
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5033
5034
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for key, value in headers.items():
            # Title-casing normalizes the key so differing cases collapse
            merged[key.title()] = value
    return merged
5038
5039
def cached_method(f):
    """Cache a method"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Normalize the call so differently-spelled but equal calls share a key
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        key = tuple(bound.arguments.values())[1:]  # drop `self` from the key

        # Per-instance cache, namespaced by method name
        caches = vars(self).setdefault('_cached_method__cache', {})
        cache = caches.setdefault(f.__name__, {})
        if key not in cache:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper
5055
5056
class classproperty:
    """property access for class methods with optional caching"""
    def __new__(cls, func=None, *args, **kwargs):
        # Support both @classproperty and @classproperty(cache=True):
        # when called without a function, return a decorator awaiting one
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None  # None disables caching

    def __get__(self, _, cls):
        # Descriptor protocol: value is computed per owning class (the
        # accessing instance is ignored)
        if self._cache is None:
            return self.func(cls)
        elif cls not in self._cache:
            self._cache[cls] = self.func(cls)
        return self._cache[cls]
5075
5076
class function_with_repr:
    """Wrap a callable so that repr() shows either a custom string or its qualified name."""

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func, self.__repr = func, repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5089
5090
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        # Iterating yields the attribute values, in insertion order
        yield from self.__dict__.values()

    @property
    def items_(self):
        return self.__dict__.items()
5100
5101
# Known file extensions grouped by media kind; the "common_*" groups are
# merged into the broader "video"/"audio" groups right below
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# NB: despite Namespace's "Immutable" docstring, attribute rebinding works here
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5116
5117
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        # error_callback is invoked with (error, attempt, retries) after each
        # failed attempt; extra kwargs are pre-bound onto it
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # NO_DEFAULT marks a successful (error-free) attempt; it is a sentinel
        # distinct from None so that None remains a valid error value
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # The error set during the current attempt, or None if none was set
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT  # reset before each attempt
            self.attempt += 1
            yield self
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # Prefer the underlying cause over the wrapped extractor message
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        # sleep_func may be a constant delay or a callable of the retry count
        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5172
5173
def make_archive_id(ie, video_id):
    """Build a download-archive entry ("&lt;lowercased ie key&gt; &lt;video id&gt;") from
    an extractor (or its key as a string) and a video ID."""
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
5177
5178
def truncate_string(s, left, right=0):
    """Shorten `s` to at most ``left + right`` characters, keeping the first
    ``left - 3`` and last ``right`` characters joined by "...".
    Returns `s` unchanged when it is None or already short enough."""
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    head = s[:left - 3]
    tail = s[-right:] if right else ''
    return f'{head}...{tail}'
5184
5185
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Expand `options` (names, aliases and '-'-prefixed negations) into a
    deduplicated list, continuing from `start`.

    `alias_dict` must contain an 'all' key listing every valid value; with
    `use_regex`, non-alias entries are matched against that list as
    case-insensitive full-match regexes. A plain entry that is neither an
    alias nor in alias_dict['all'] raises ValueError.
    """
    assert 'all' in alias_dict, '"all" alias is required'
    selected = list(start or [])
    for token in options:
        negated = token.startswith('-')
        name = token[1:] if negated else token

        if name in alias_dict:
            expansion = alias_dict[name]
            if negated:
                # Negating an alias flips the sign of each entry it expands to
                expansion = [entry[1:] if entry.startswith('-') else f'-{entry}' for entry in expansion]
            # NB: Do not allow regex in aliases for performance
            selected = orderedSet_from_options(expansion, alias_dict, start=selected)
            continue

        if use_regex:
            matched = filter(re.compile(name, re.I).fullmatch, alias_dict['all'])
        elif name in alias_dict['all']:
            matched = [name]
        else:
            raise ValueError(name)

        if negated:
            for entry in matched:
                while entry in selected:
                    selected.remove(entry)
        else:
            selected.extend(matched)

    return orderedSet(selected)
5214
5215
5216 # TODO: Rewrite
class FormatSorter:
    """Sorts formats according to user- and extractor-supplied field preferences.

    Fields, their aliases and comparison semantics are declared in `settings`;
    `evaluate_params` builds the effective field order (`self._order`) from the
    defaults, the user's 'format_sort' param and the extractor's preferences,
    and `calculate_preference` maps a format dict to a tuple of per-field
    comparison tuples usable as a sort key.
    """

    # One item of a sort string: optional '+' (reverse), a field name, and an
    # optional limit introduced by ':' or by '~' (prefer values closest to it)
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Alternative default order (youtube-dl-style, judging by the name)
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration; keys absent here are materialized on demand by
    # _get_field_setting. For 'ordered' fields, earlier entries in 'order' rank
    # higher; 'order_free' is used instead when prefer_free_formats is set.
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        # Combined/multiple fields derived from the simple fields above
        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        # field_preference: sort order requested by the extractor
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Return settings[field][key], materializing and caching a
        type-dependent default when the key (or the whole field) is missing.
        Unknown fields trigger a deprecation warning and get an empty entry."""
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            type = propObj.get('type')  # NOTE: shadows the builtin; kept as-is
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert a raw field (or limit) value per the field's 'convert'
        setting. For 'order' fields, returns the value's rank in the order
        list (earlier entries rank higher); with convertNone, a None value is
        ranked like any other instead of being passed through."""
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            # 'float_string' (and any other) conversion: numeric strings become
            # floats; otherwise the field degrades to string comparison
            if value.isnumeric():
                return float(value)
            else:
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Build self._order (and each field's reverse/closest/limit data)
        from: forced defaults, then priority defaults (unless
        'format_sort_force'), the user's 'format_sort', the extractor's order
        and finally the remaining defaults. First occurrence of a field wins."""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Register one field in the sort order; duplicates are ignored
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            # 'combined' fields may take one limit per sub-field, colon-separated
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Emit the user/extractor-given and effective sort orders via write_debug."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Map one field's raw value to a comparison tuple, applying the
        field's reverse/closest/limit settings and type-specific conversion."""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Extract the field's value from `format` (combining sub-fields for
        'multiple' types) and convert it to a comparison tuple."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Fill in missing protocol/ext/bitrate entries of `format` (in place)
        and return its sort key: one comparison tuple per ordered field."""
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5511
5512
5513 # XXX: Temporary
5514 class _YDLLogger:
5515 def __init__(self, ydl=None):
5516 self._ydl = ydl
5517
5518 def debug(self, message):
5519 if self._ydl:
5520 self._ydl.write_debug(message)
5521
5522 def info(self, message):
5523 if self._ydl:
5524 self._ydl.to_screen(message)
5525
5526 def warning(self, message, *, once=False):
5527 if self._ydl:
5528 self._ydl.report_warning(message, only_once=once)
5529
5530 def error(self, message, *, is_error=True):
5531 if self._ydl:
5532 self._ydl.report_error(message, is_error=is_error)
5533
5534 def stdout(self, message):
5535 if self._ydl:
5536 self._ydl.to_stdout(message)
5537
5538 def stderr(self, message):
5539 if self._ydl:
5540 self._ydl.to_stderr(message)