import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from . import traversal

from ..compat import functools  # isort: split
from ..compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from ..dependencies import brotli, certifi, websockets, xattr
from ..socks import ProxyType, sockssocket

__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
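# Usage sketch (illustrative only; the filename is hypothetical):
#   write_json_file({'id': 'abc123'}, 'info.json')
# serializes to a same-directory temp file first, then renames it into place.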


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)
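# Usage sketch (illustrative only; `doc` stands for an ElementTree element):
#   find_xpath_attr(doc, './/source', 'src', 'a.mp4')
# is equivalent to doc.find(".//source[@src='a.mp4']")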

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
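# Usage sketch (illustrative only; the namespace URI is made up):
#   xpath_with_ns('media:song/media:url', {'media': 'http://example.com/ns'})
#   == '{http://example.com/ns}song/{http://example.com/ns}url'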


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kwargs):
386 """Return the content of all tags with the specified class in the passed HTML document as a list"""
387 return get_elements_by_attribute(
388 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
389 html, escape_value=False)
390
391
392 def get_elements_html_by_class(class_name, html):
393 """Return the html of all tags with the specified class in the passed HTML document as a list"""
394 return get_elements_html_by_attribute(
395 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
396 html, escape_value=False)
397
398
399 def get_elements_by_attribute(*args, **kwargs):
400 """Return the content of the tag with the specified attribute in the passed HTML document"""
401 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
402
403
404 def get_elements_html_by_attribute(*args, **kwargs):
405 """Return the html of the tag with the specified attribute in the passed HTML document"""
406 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
407
408
409 def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
410 """
411 Return the text (content) and the html (whole) of the tag with the specified
412 attribute in the passed HTML document
413 """
414 if not value:
415 return
416
417 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
418
419 value = re.escape(value) if escape_value else value
420
421 partial_element_re = rf'''(?x)
422 <(?P<tag>{tag})
423 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
424 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
425 '''
426
427 for m in re.finditer(partial_element_re, html):
428 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
429
430 yield (
431 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
432 whole
433 )
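# Usage sketch (illustrative only): the generator yields (text, html) pairs, e.g.
#   list(get_elements_text_and_html_by_attribute(
#       'class', 'foo', '<span class="foo">bar</span>'))
#   == [('bar', '<span class="foo">bar</span>')]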


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
483 """
484 def find_or_raise(haystack, needle, exc):
485 try:
486 return haystack.index(needle)
487 except ValueError:
488 raise exc
489 closing_tag = f'</{tag}>'
490 whole_start = find_or_raise(
491 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
492 content_start = find_or_raise(
493 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
494 content_start += whole_start + 1
495 with HTMLBreakOnClosingTagParser() as parser:
496 parser.feed(html[whole_start:content_start])
497 if not parser.tagstack or parser.tagstack[0] != tag:
498 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
499 offset = content_start
500 while offset < len(html):
501 next_closing_tag_start = find_or_raise(
502 html[offset:], closing_tag,
503 compat_HTMLParseError(f'closing {tag} tag not found'))
504 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
505 try:
506 parser.feed(html[offset:offset + next_closing_tag_end])
507 offset += next_closing_tag_end
508 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
509 return html[content_start:offset + next_closing_tag_start], \
510 html[whole_start:offset + next_closing_tag_end]
511 raise compat_HTMLParseError('unexpected end of html')
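# Usage sketch (illustrative only):
#   get_element_text_and_html_by_tag('span', '<div><span>bar</span></div>')
#   == ('bar', '<span>bar</span>')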


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
565 """Given a string for an series of HTML <li> elements,
566 return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
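# Usage sketch (illustrative only):
#   clean_html('<p>foo<br/>bar &amp; baz</p>')  == 'foo\nbar & baz'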


class LenientJSONDecoder(json.JSONDecoder):
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
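# Usage sketch (illustrative only): with close_objects=1, a truncated object can
# be recovered by appending the missing bracket(s), e.g.
#   LenientJSONDecoder(close_objects=1).decode('{"a": 1')  == {'a': 1}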


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
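# Usage sketch (illustrative only):
#   timeconvert('Sat, 01 Jan 2022 00:00:00 +0000')  == 1640995200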


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
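# Usage sketches (illustrative only; exact substitutions depend on the flags):
#   sanitize_filename('A/B: C', restricted=True)  == 'A_B_-_C'
#   sanitize_filename('foo|bar?')  replaces '|' and '?' with their full-width
#   look-alikes under the default (is_id unset) rules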


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
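# Usage sketches (illustrative only):
#   sanitize_url('//example.com/video')      == 'http://example.com/video'
#   sanitize_url('httpss://example.com')     == 'https://example.com'
#   sanitize_url('rmtp://example.com/live')  == 'rtmp://example.com/live'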


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
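# Usage sketch (illustrative only; the credentials are made up):
#   extract_basic_auth('http://user:pass@example.com/')
#   == ('http://example.com/', 'Basic dXNlcjpwYXNz')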


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
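# Usage sketch (illustrative only):
#   orderedSet([1, 2, 1, 3])        == [1, 2, 3]
#   orderedSet('cabcb', lazy=True)  yields 'c', 'a', 'b' on demand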


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
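# Usage sketch (illustrative only):
#   unescapeHTML('&amp; &#x27;quoted&#x27;')  == "& 'quoted'"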


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
             https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
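# Usage sketch (illustrative only; the command is assumed to be installed):
#   stdout, stderr, returncode = Popen.run(
#       ['ffmpeg', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)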


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)
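# Usage sketch (illustrative only):
#   timetuple_from_msec(90061001)
#   == Time(hours=25, minutes=1, seconds=1, milliseconds=1)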


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
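# Usage sketches (illustrative only):
#   formatSeconds(3661)             == '1:01:01'
#   formatSeconds(61.5, msec=True)  == '1:01.500'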


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
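# Usage sketch (illustrative only; `params` mirrors the option keys read above,
# e.g. 'nocheckcertificate', 'legacyserverconnect', 'client_certificate'):
#   opener = urllib.request.build_opener(make_HTTPS_handler({}))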


def bug_reports_message(before=';'):
    from ..update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to work around _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # urllib capitalizes the header names stored in req.headers because
            # of Python bug 2275: http://bugs.python.org/issue2275
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Youtubedl-no-compression' in req.headers:  # deprecated
            req.headers.pop('Youtubedl-no-compression', None)
            req.add_header('Accept-encoding', 'identity')

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
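# Usage sketch (illustrative only; the proxy address is made up):
#   conn_class = make_socks_conn_class(
#       http.client.HTTPConnection, 'socks5://127.0.0.1:1080')
#   conn = conn_class('example.com', 80)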


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if is_path_like(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if is_path_like(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
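# Usage sketch (illustrative only; the filename is hypothetical):
#   jar = YoutubeDLCookieJar('cookies.txt')
#   jar.load(ignore_discard=True, ignore_expires=True)
#   jar.save()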


class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
1700 # Strip headers that describe the previous request's body, since they may not apply to the redirected request
1701 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1702
1703 # A 303 must either use GET or HEAD for subsequent request
1704 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1705 if code == 303 and m != 'HEAD':
1706 m = 'GET'
1707 # 301 and 302 redirects are commonly turned into a GET from a POST
1708 # for subsequent requests by browsers, so we'll do the same.
1709 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1710 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1711 if code in (301, 302) and m == 'POST':
1712 m = 'GET'
1713
1714 return urllib.request.Request(
1715 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1716 unverifiable=True, method=m)
1717
1718
1719 def extract_timezone(date_str):
1720 m = re.search(
1721 r'''(?x)
1722 ^.{8,}? # >=8 char non-TZ prefix, if present
1723 (?P<tz>Z| # just the UTC Z, or
1724 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm, or
1725 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by a 3-letter word, >=4 letters or 2 digits
1726 [ ]? # optional space
1727 (?P<sign>\+|-) # +/-
1728 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1729 $)
1730 ''', date_str)
1731 if not m:
1732 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1733 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1734 if timezone is not None:
1735 date_str = date_str[:-len(m.group('tz'))]
1736 timezone = datetime.timedelta(hours=timezone or 0)
1737 else:
1738 date_str = date_str[:-len(m.group('tz'))]
1739 if not m.group('sign'):
1740 timezone = datetime.timedelta()
1741 else:
1742 sign = 1 if m.group('sign') == '+' else -1
1743 timezone = datetime.timedelta(
1744 hours=sign * int(m.group('hours')),
1745 minutes=sign * int(m.group('minutes')))
1746 return timezone, date_str
1747
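# Usage sketch (illustrative addition, not part of the original module; values
# assume the regex above matches as documented):
#   extract_timezone('2023-01-01T12:00:00+05:30')
#   # -> (datetime.timedelta(seconds=19800), '2023-01-01T12:00:00')
#   extract_timezone('2023-01-01T12:00:00Z')
#   # -> (datetime.timedelta(0), '2023-01-01T12:00:00')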
1748
1749 def parse_iso8601(date_str, delimiter='T', timezone=None):
1750 """ Return a UNIX timestamp from the given date """
1751
1752 if date_str is None:
1753 return None
1754
1755 date_str = re.sub(r'\.[0-9]+', '', date_str)
1756
1757 if timezone is None:
1758 timezone, date_str = extract_timezone(date_str)
1759
1760 with contextlib.suppress(ValueError):
1761 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1762 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1763 return calendar.timegm(dt.timetuple())
1764
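# Usage sketch (illustrative; epoch values computed by hand, assuming UTC):
#   parse_iso8601('2023-01-01T12:00:00Z')       # -> 1672574400
#   parse_iso8601('2023-01-01T12:00:00+05:30')  # -> 1672554600 (offset subtracted)
#   parse_iso8601(None)                         # -> None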
1765
1766 def date_formats(day_first=True):
1767 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1768
1769
1770 def unified_strdate(date_str, day_first=True):
1771 """Return a string with the date in the format YYYYMMDD"""
1772
1773 if date_str is None:
1774 return None
1775 upload_date = None
1776 # Replace commas
1777 date_str = date_str.replace(',', ' ')
1778 # Remove AM/PM + timezone
1779 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1780 _, date_str = extract_timezone(date_str)
1781
1782 for expression in date_formats(day_first):
1783 with contextlib.suppress(ValueError):
1784 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1785 if upload_date is None:
1786 timetuple = email.utils.parsedate_tz(date_str)
1787 if timetuple:
1788 with contextlib.suppress(ValueError):
1789 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1790 if upload_date is not None:
1791 return str(upload_date)
1792
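# Usage sketch (illustrative; relies on DATE_FORMATS defined elsewhere in this module):
#   unified_strdate('December 21, 2010')  # -> '20101221'
#   unified_strdate('8/7/2009')           # -> '20090708' (day_first=True by default)
#   unified_strdate('not a date')         # -> None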
1793
1794 def unified_timestamp(date_str, day_first=True):
1795 if date_str is None:
1796 return None
1797
1798 date_str = re.sub(r'\s+', ' ', re.sub(
1799 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1800
1801 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1802 timezone, date_str = extract_timezone(date_str)
1803
1804 # Remove AM/PM + timezone
1805 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1806
1807 # Remove unrecognized timezones from ISO 8601-like timestamps
1808 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1809 if m:
1810 date_str = date_str[:-len(m.group('tz'))]
1811
1812 # Python only supports microseconds, so remove nanoseconds
1813 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1814 if m:
1815 date_str = m.group(1)
1816
1817 for expression in date_formats(day_first):
1818 with contextlib.suppress(ValueError):
1819 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1820 return calendar.timegm(dt.timetuple())
1821
1822 timetuple = email.utils.parsedate_tz(date_str)
1823 if timetuple:
1824 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1825
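# Usage sketch (illustrative; assumes the corresponding format is in DATE_FORMATS):
#   unified_timestamp('2023-01-01T12:00:00Z')          # -> 1672574400
#   unified_timestamp('December 15, 2017 at 7:49 am')  # -> 1513324140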
1826
1827 def determine_ext(url, default_ext='unknown_video'):
1828 if url is None or '.' not in url:
1829 return default_ext
1830 guess = url.partition('?')[0].rpartition('.')[2]
1831 if re.match(r'^[A-Za-z0-9]+$', guess):
1832 return guess
1833 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1834 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1835 return guess.rstrip('/')
1836 else:
1837 return default_ext
1838
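# Usage sketch (illustrative; assumes 'mp4' is listed in KNOWN_EXTENSIONS):
#   determine_ext('http://example.com/video.mp4?download=1')   # -> 'mp4'
#   determine_ext('http://example.com/foo/bar.mp4/?download')  # -> 'mp4'
#   determine_ext('http://example.com/page')                   # -> 'unknown_video'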
1839
1840 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1841 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1842
1843
1844 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1845 R"""
1846 Return a datetime object from a string.
1847 Supported format:
1848 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1849
1850 @param format strftime format of DATE
1851 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1852 auto: round to the unit provided in date_str (if applicable).
1853 """
1854 auto_precision = False
1855 if precision == 'auto':
1856 auto_precision = True
1857 precision = 'microsecond'
1858 today = datetime_round(datetime.datetime.utcnow(), precision)
1859 if date_str in ('now', 'today'):
1860 return today
1861 if date_str == 'yesterday':
1862 return today - datetime.timedelta(days=1)
1863 match = re.match(
1864 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1865 date_str)
1866 if match is not None:
1867 start_time = datetime_from_str(match.group('start'), precision, format)
1868 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1869 unit = match.group('unit')
1870 if unit == 'month' or unit == 'year':
1871 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1872 unit = 'day'
1873 else:
1874 if unit == 'week':
1875 unit = 'day'
1876 time *= 7
1877 delta = datetime.timedelta(**{unit + 's': time})
1878 new_date = start_time + delta
1879 if auto_precision:
1880 return datetime_round(new_date, unit)
1881 return new_date
1882
1883 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1884
1885
1886 def date_from_str(date_str, format='%Y%m%d', strict=False):
1887 R"""
1888 Return a date object from a string using datetime_from_str
1889
1890 @param strict Restrict allowed patterns to "YYYYMMDD" and
1891 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1892 """
1893 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1894 raise ValueError(f'Invalid date format "{date_str}"')
1895 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1896
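# Usage sketch (illustrative; results depend on the current date):
#   date_from_str('now')          # -> today's date
#   date_from_str('today-1week')  # -> the date 7 days ago
#   date_from_str('20230101')     # -> datetime.date(2023, 1, 1)
#   date_from_str('now+5months', strict=True)  # raises ValueError ('+' offsets are not allowed in strict mode)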
1897
1898 def datetime_add_months(dt, months):
1899 """Increment/Decrement a datetime object by months."""
1900 month = dt.month + months - 1
1901 year = dt.year + month // 12
1902 month = month % 12 + 1
1903 day = min(dt.day, calendar.monthrange(year, month)[1])
1904 return dt.replace(year, month, day)
1905
1906
1907 def datetime_round(dt, precision='day'):
1908 """
1909 Round a datetime object's time to a specific precision
1910 """
1911 if precision == 'microsecond':
1912 return dt
1913
1914 unit_seconds = {
1915 'day': 86400,
1916 'hour': 3600,
1917 'minute': 60,
1918 'second': 1,
1919 }
1920 roundto = lambda x, n: ((x + n / 2) // n) * n
1921 timestamp = calendar.timegm(dt.timetuple())
1922 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1923
1924
1925 def hyphenate_date(date_str):
1926 """
1927 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1928 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1929 if match is not None:
1930 return '-'.join(match.groups())
1931 else:
1932 return date_str
1933
1934
1935 class DateRange:
1936 """Represents a time interval between two dates"""
1937
1938 def __init__(self, start=None, end=None):
1939 """start and end must be strings in the format accepted by date"""
1940 if start is not None:
1941 self.start = date_from_str(start, strict=True)
1942 else:
1943 self.start = datetime.datetime.min.date()
1944 if end is not None:
1945 self.end = date_from_str(end, strict=True)
1946 else:
1947 self.end = datetime.datetime.max.date()
1948 if self.start > self.end:
1949 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
1950
1951 @classmethod
1952 def day(cls, day):
1953 """Returns a range that only contains the given day"""
1954 return cls(day, day)
1955
1956 def __contains__(self, date):
1957 """Check if the date is in the range"""
1958 if not isinstance(date, datetime.date):
1959 date = date_from_str(date)
1960 return self.start <= date <= self.end
1961
1962 def __repr__(self):
1963 return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'
1964
1965 def __eq__(self, other):
1966 return (isinstance(other, DateRange)
1967 and self.start == other.start and self.end == other.end)
1968
1969
1970 @functools.cache
1971 def system_identifier():
1972 python_implementation = platform.python_implementation()
1973 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1974 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1975 libc_ver = []
1976 with contextlib.suppress(OSError): # We may not have access to the executable
1977 libc_ver = platform.libc_ver()
1978
1979 return 'Python %s (%s %s %s) - %s (%s%s)' % (
1980 platform.python_version(),
1981 python_implementation,
1982 platform.machine(),
1983 platform.architecture()[0],
1984 platform.platform(),
1985 ssl.OPENSSL_VERSION,
1986 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
1987 )
1988
1989
1990 @functools.cache
1991 def get_windows_version():
1992 ''' Get the Windows version. Returns () if not running on Windows '''
1993 if compat_os_name == 'nt':
1994 return version_tuple(platform.win32_ver()[1])
1995 else:
1996 return ()
1997
1998
1999 def write_string(s, out=None, encoding=None):
2000 assert isinstance(s, str)
2001 out = out or sys.stderr
2002 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
2003 if not out:
2004 return
2005
2006 if compat_os_name == 'nt' and supports_terminal_sequences(out):
2007 s = re.sub(r'([\r\n]+)', r' \1', s)
2008
2009 enc, buffer = None, out
2010 if 'b' in getattr(out, 'mode', ''):
2011 enc = encoding or preferredencoding()
2012 elif hasattr(out, 'buffer'):
2013 buffer = out.buffer
2014 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2015
2016 buffer.write(s.encode(enc, 'ignore') if enc else s)
2017 out.flush()
2018
2019
2020 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2021 from .. import _IN_CLI
2022 if _IN_CLI:
2023 if msg in deprecation_warning._cache:
2024 return
2025 deprecation_warning._cache.add(msg)
2026 if printer:
2027 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2028 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2029 else:
2030 import warnings
2031 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2032
2033
2034 deprecation_warning._cache = set()
2035
2036
2037 def bytes_to_intlist(bs):
2038 if not bs:
2039 return []
2040 if isinstance(bs[0], int): # Python 3
2041 return list(bs)
2042 else:
2043 return [ord(c) for c in bs]
2044
2045
2046 def intlist_to_bytes(xs):
2047 if not xs:
2048 return b''
2049 return struct.pack('%dB' % len(xs), *xs)
2050
2051
2052 class LockingUnsupportedError(OSError):
2053 msg = 'File locking is not supported'
2054
2055 def __init__(self):
2056 super().__init__(self.msg)
2057
2058
2059 # Cross-platform file locking
2060 if sys.platform == 'win32':
2061 import ctypes
2062 import ctypes.wintypes
2063 import msvcrt
2064
2065 class OVERLAPPED(ctypes.Structure):
2066 _fields_ = [
2067 ('Internal', ctypes.wintypes.LPVOID),
2068 ('InternalHigh', ctypes.wintypes.LPVOID),
2069 ('Offset', ctypes.wintypes.DWORD),
2070 ('OffsetHigh', ctypes.wintypes.DWORD),
2071 ('hEvent', ctypes.wintypes.HANDLE),
2072 ]
2073
2074 kernel32 = ctypes.WinDLL('kernel32')
2075 LockFileEx = kernel32.LockFileEx
2076 LockFileEx.argtypes = [
2077 ctypes.wintypes.HANDLE, # hFile
2078 ctypes.wintypes.DWORD, # dwFlags
2079 ctypes.wintypes.DWORD, # dwReserved
2080 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2081 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2082 ctypes.POINTER(OVERLAPPED) # Overlapped
2083 ]
2084 LockFileEx.restype = ctypes.wintypes.BOOL
2085 UnlockFileEx = kernel32.UnlockFileEx
2086 UnlockFileEx.argtypes = [
2087 ctypes.wintypes.HANDLE, # hFile
2088 ctypes.wintypes.DWORD, # dwReserved
2089 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2090 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2091 ctypes.POINTER(OVERLAPPED) # Overlapped
2092 ]
2093 UnlockFileEx.restype = ctypes.wintypes.BOOL
2094 whole_low = 0xffffffff
2095 whole_high = 0x7fffffff
2096
2097 def _lock_file(f, exclusive, block):
2098 overlapped = OVERLAPPED()
2099 overlapped.Offset = 0
2100 overlapped.OffsetHigh = 0
2101 overlapped.hEvent = 0
2102 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2103
2104 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2105 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2106 0, whole_low, whole_high, f._lock_file_overlapped_p):
2107 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2108 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2109
2110 def _unlock_file(f):
2111 assert f._lock_file_overlapped_p
2112 handle = msvcrt.get_osfhandle(f.fileno())
2113 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2114 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2115
2116 else:
2117 try:
2118 import fcntl
2119
2120 def _lock_file(f, exclusive, block):
2121 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2122 if not block:
2123 flags |= fcntl.LOCK_NB
2124 try:
2125 fcntl.flock(f, flags)
2126 except BlockingIOError:
2127 raise
2128 except OSError: # AOSP does not have flock()
2129 fcntl.lockf(f, flags)
2130
2131 def _unlock_file(f):
2132 with contextlib.suppress(OSError):
2133 return fcntl.flock(f, fcntl.LOCK_UN)
2134 with contextlib.suppress(OSError):
2135 return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
2136 return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
2137
2138 except ImportError:
2139
2140 def _lock_file(f, exclusive, block):
2141 raise LockingUnsupportedError()
2142
2143 def _unlock_file(f):
2144 raise LockingUnsupportedError()
2145
2146
2147 class locked_file:
2148 locked = False
2149
2150 def __init__(self, filename, mode, block=True, encoding=None):
2151 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2152 raise NotImplementedError(mode)
2153 self.mode, self.block = mode, block
2154
2155 writable = any(f in mode for f in 'wax+')
2156 readable = any(f in mode for f in 'r+')
2157 flags = functools.reduce(operator.ior, (
2158 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2159 getattr(os, 'O_BINARY', 0), # Windows only
2160 getattr(os, 'O_NOINHERIT', 0), # Windows only
2161 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2162 os.O_APPEND if 'a' in mode else 0,
2163 os.O_EXCL if 'x' in mode else 0,
2164 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2165 ))
2166
2167 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2168
2169 def __enter__(self):
2170 exclusive = 'r' not in self.mode
2171 try:
2172 _lock_file(self.f, exclusive, self.block)
2173 self.locked = True
2174 except OSError:
2175 self.f.close()
2176 raise
2177 if 'w' in self.mode:
2178 try:
2179 self.f.truncate()
2180 except OSError as e:
2181 if e.errno not in (
2182 errno.ESPIPE, # Illegal seek - expected for FIFO
2183 errno.EINVAL, # Invalid argument - expected for /dev/null
2184 ):
2185 raise
2186 return self
2187
2188 def unlock(self):
2189 if not self.locked:
2190 return
2191 try:
2192 _unlock_file(self.f)
2193 finally:
2194 self.locked = False
2195
2196 def __exit__(self, *_):
2197 try:
2198 self.unlock()
2199 finally:
2200 self.f.close()
2201
2202 open = __enter__
2203 close = __exit__
2204
2205 def __getattr__(self, attr):
2206 return getattr(self.f, attr)
2207
2208 def __iter__(self):
2209 return iter(self.f)
2210
2211
2212 @functools.cache
2213 def get_filesystem_encoding():
2214 encoding = sys.getfilesystemencoding()
2215 return encoding if encoding is not None else 'utf-8'
2216
2217
2218 def shell_quote(args):
2219 quoted_args = []
2220 encoding = get_filesystem_encoding()
2221 for a in args:
2222 if isinstance(a, bytes):
2223 # We may get a filename encoded with 'encodeFilename'
2224 a = a.decode(encoding)
2225 quoted_args.append(compat_shlex_quote(a))
2226 return ' '.join(quoted_args)
2227
2228
2229 def smuggle_url(url, data):
2230 """ Pass additional data in a URL for internal use. """
2231
2232 url, idata = unsmuggle_url(url, {})
2233 data.update(idata)
2234 sdata = urllib.parse.urlencode(
2235 {'__youtubedl_smuggle': json.dumps(data)})
2236 return url + '#' + sdata
2237
2238
2239 def unsmuggle_url(smug_url, default=None):
2240 if '#__youtubedl_smuggle' not in smug_url:
2241 return smug_url, default
2242 url, _, sdata = smug_url.rpartition('#')
2243 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2244 data = json.loads(jsond)
2245 return url, data
2246
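# Usage sketch (illustrative round trip):
#   url = smuggle_url('https://example.com/video', {'referer': 'https://example.com'})
#   unsmuggle_url(url)
#   # -> ('https://example.com/video', {'referer': 'https://example.com'})
#   unsmuggle_url('https://example.com/video')  # -> ('https://example.com/video', None)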
2247
2248 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2249 """ Formats numbers with decimal sufixes like K, M, etc """
2250 num, factor = float_or_none(num), float(factor)
2251 if num is None or num < 0:
2252 return None
2253 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2254 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2255 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2256 if factor == 1024:
2257 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2258 converted = num / (factor ** exponent)
2259 return fmt % (converted, suffix)
2260
2261
2262 def format_bytes(bytes):
2263 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2264
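# Usage sketch (illustrative):
#   format_decimal_suffix(1234, '%.1f%s')  # -> '1.2k'
#   format_bytes(1536)                     # -> '1.50KiB'
#   format_bytes(None)                     # -> 'N/A'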
2265
2266 def lookup_unit_table(unit_table, s, strict=False):
2267 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
2268 units_re = '|'.join(re.escape(u) for u in unit_table)
2269 m = (re.fullmatch if strict else re.match)(
2270 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
2271 if not m:
2272 return None
2273
2274 num = float(m.group('num').replace(',', '.'))
2275 mult = unit_table[m.group('unit')]
2276 return round(num * mult)
2277
2278
2279 def parse_bytes(s):
2280 """Parse a string indicating a byte quantity into an integer"""
2281 return lookup_unit_table(
2282 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2283 s.upper(), strict=True)
2284
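# Usage sketch (illustrative; parse_bytes always uses binary (1024-based) units):
#   parse_bytes('10M')   # -> 10485760 (10 * 1024**2)
#   parse_bytes('500k')  # -> 512000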
2285
2286 def parse_filesize(s):
2287 if s is None:
2288 return None
2289
2290 # The lower-case forms are of course incorrect and unofficial,
2291 # but we support those too
2292 _UNIT_TABLE = {
2293 'B': 1,
2294 'b': 1,
2295 'bytes': 1,
2296 'KiB': 1024,
2297 'KB': 1000,
2298 'kB': 1024,
2299 'Kb': 1000,
2300 'kb': 1000,
2301 'kilobytes': 1000,
2302 'kibibytes': 1024,
2303 'MiB': 1024 ** 2,
2304 'MB': 1000 ** 2,
2305 'mB': 1024 ** 2,
2306 'Mb': 1000 ** 2,
2307 'mb': 1000 ** 2,
2308 'megabytes': 1000 ** 2,
2309 'mebibytes': 1024 ** 2,
2310 'GiB': 1024 ** 3,
2311 'GB': 1000 ** 3,
2312 'gB': 1024 ** 3,
2313 'Gb': 1000 ** 3,
2314 'gb': 1000 ** 3,
2315 'gigabytes': 1000 ** 3,
2316 'gibibytes': 1024 ** 3,
2317 'TiB': 1024 ** 4,
2318 'TB': 1000 ** 4,
2319 'tB': 1024 ** 4,
2320 'Tb': 1000 ** 4,
2321 'tb': 1000 ** 4,
2322 'terabytes': 1000 ** 4,
2323 'tebibytes': 1024 ** 4,
2324 'PiB': 1024 ** 5,
2325 'PB': 1000 ** 5,
2326 'pB': 1024 ** 5,
2327 'Pb': 1000 ** 5,
2328 'pb': 1000 ** 5,
2329 'petabytes': 1000 ** 5,
2330 'pebibytes': 1024 ** 5,
2331 'EiB': 1024 ** 6,
2332 'EB': 1000 ** 6,
2333 'eB': 1024 ** 6,
2334 'Eb': 1000 ** 6,
2335 'eb': 1000 ** 6,
2336 'exabytes': 1000 ** 6,
2337 'exbibytes': 1024 ** 6,
2338 'ZiB': 1024 ** 7,
2339 'ZB': 1000 ** 7,
2340 'zB': 1024 ** 7,
2341 'Zb': 1000 ** 7,
2342 'zb': 1000 ** 7,
2343 'zettabytes': 1000 ** 7,
2344 'zebibytes': 1024 ** 7,
2345 'YiB': 1024 ** 8,
2346 'YB': 1000 ** 8,
2347 'yB': 1024 ** 8,
2348 'Yb': 1000 ** 8,
2349 'yb': 1000 ** 8,
2350 'yottabytes': 1000 ** 8,
2351 'yobibytes': 1024 ** 8,
2352 }
2353
2354 return lookup_unit_table(_UNIT_TABLE, s)
2355
2356
2357 def parse_count(s):
2358 if s is None:
2359 return None
2360
2361 s = re.sub(r'^[^\d]+\s', '', s).strip()
2362
2363 if re.match(r'^[\d,.]+$', s):
2364 return str_to_int(s)
2365
2366 _UNIT_TABLE = {
2367 'k': 1000,
2368 'K': 1000,
2369 'm': 1000 ** 2,
2370 'M': 1000 ** 2,
2371 'kk': 1000 ** 2,
2372 'KK': 1000 ** 2,
2373 'b': 1000 ** 3,
2374 'B': 1000 ** 3,
2375 }
2376
2377 ret = lookup_unit_table(_UNIT_TABLE, s)
2378 if ret is not None:
2379 return ret
2380
2381 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2382 if mobj:
2383 return str_to_int(mobj.group(1))
2384
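# Usage sketch (illustrative):
#   parse_filesize('1.5 MiB')   # -> 1572864
#   parse_filesize('5 GB')      # -> 5000000000
#   parse_count('1.2M')         # -> 1200000
#   parse_count('1,000 views')  # -> 1000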
2385
2386 def parse_resolution(s, *, lenient=False):
2387 if s is None:
2388 return {}
2389
2390 if lenient:
2391 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2392 else:
2393 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2394 if mobj:
2395 return {
2396 'width': int(mobj.group('w')),
2397 'height': int(mobj.group('h')),
2398 }
2399
2400 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2401 if mobj:
2402 return {'height': int(mobj.group(1))}
2403
2404 mobj = re.search(r'\b([48])[kK]\b', s)
2405 if mobj:
2406 return {'height': int(mobj.group(1)) * 540}
2407
2408 return {}
2409
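# Usage sketch (illustrative):
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # -> {'height': 720}
#   parse_resolution('4k')         # -> {'height': 2160}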
2410
2411 def parse_bitrate(s):
2412 if not isinstance(s, str):
2413 return
2414 mobj = re.search(r'\b(\d+)\s*kbps', s)
2415 if mobj:
2416 return int(mobj.group(1))
2417
2418
2419 def month_by_name(name, lang='en'):
2420 """ Return the number of a month by (locale-independently) English name """
2421
2422 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2423
2424 try:
2425 return month_names.index(name) + 1
2426 except ValueError:
2427 return None
2428
2429
2430 def month_by_abbreviation(abbrev):
2431 """ Return the number of a month by (locale-independently) English
2432 abbreviations """
2433
2434 try:
2435 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2436 except ValueError:
2437 return None
2438
2439
2440 def fix_xml_ampersands(xml_str):
2441 """Replace all the '&' by '&amp;' in XML"""
2442 return re.sub(
2443 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2444 '&amp;',
2445 xml_str)
2446
2447
2448 def setproctitle(title):
2449 assert isinstance(title, str)
2450
2451 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2452 try:
2453 import ctypes
2454 except ImportError:
2455 return
2456
2457 try:
2458 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2459 except OSError:
2460 return
2461 except TypeError:
2462 # LoadLibrary in Windows Python 2.7.13 only expects
2463 # a bytestring, but since unicode_literals turns
2464 # every string into a unicode string, it fails.
2465 return
2466 title_bytes = title.encode()
2467 buf = ctypes.create_string_buffer(len(title_bytes))
2468 buf.value = title_bytes
2469 try:
2470 libc.prctl(15, buf, 0, 0, 0)
2471 except AttributeError:
2472 return # Strange libc, just skip this
2473
2474
2475 def remove_start(s, start):
2476 return s[len(start):] if s is not None and s.startswith(start) else s
2477
2478
2479 def remove_end(s, end):
2480 return s[:-len(end)] if s is not None and s.endswith(end) else s
2481
2482
2483 def remove_quotes(s):
2484 if s is None or len(s) < 2:
2485 return s
2486 for quote in ('"', "'", ):
2487 if s[0] == quote and s[-1] == quote:
2488 return s[1:-1]
2489 return s
2490
2491
2492 def get_domain(url):
2493 """
2494 This implementation is inconsistent, but is kept for compatibility.
2495 Use this only for "webpage_url_domain"
2496 """
2497 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2498
2499
2500 def url_basename(url):
2501 path = urllib.parse.urlparse(url).path
2502 return path.strip('/').split('/')[-1]
2503
2504
2505 def base_url(url):
2506 return re.match(r'https?://[^?#]+/', url).group()
2507
2508
2509 def urljoin(base, path):
2510 if isinstance(path, bytes):
2511 path = path.decode()
2512 if not isinstance(path, str) or not path:
2513 return None
2514 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2515 return path
2516 if isinstance(base, bytes):
2517 base = base.decode()
2518 if not isinstance(base, str) or not re.match(
2519 r'^(?:https?:)?//', base):
2520 return None
2521 return urllib.parse.urljoin(base, path)
2522
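# Usage sketch (illustrative):
#   urljoin('https://example.com/a/', 'b/c.mp4')  # -> 'https://example.com/a/b/c.mp4'
#   urljoin('https://example.com', '//cdn.example.com/x')  # -> '//cdn.example.com/x' (already absolute)
#   urljoin('not a base', '/path')  # -> None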
2523
2524 class HEADRequest(urllib.request.Request):
2525 def get_method(self):
2526 return 'HEAD'
2527
2528
2529 class PUTRequest(urllib.request.Request):
2530 def get_method(self):
2531 return 'PUT'
2532
2533
2534 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2535 if get_attr and v is not None:
2536 v = getattr(v, get_attr, None)
2537 try:
2538 return int(v) * invscale // scale
2539 except (ValueError, TypeError, OverflowError):
2540 return default
2541
2542
2543 def str_or_none(v, default=None):
2544 return default if v is None else str(v)
2545
2546
2547 def str_to_int(int_str):
2548 """ A more relaxed version of int_or_none """
2549 if isinstance(int_str, int):
2550 return int_str
2551 elif isinstance(int_str, str):
2552 int_str = re.sub(r'[,\.\+]', '', int_str)
2553 return int_or_none(int_str)
2554
2555
2556 def float_or_none(v, scale=1, invscale=1, default=None):
2557 if v is None:
2558 return default
2559 try:
2560 return float(v) * invscale / scale
2561 except (ValueError, TypeError):
2562 return default
2563
2564
2565 def bool_or_none(v, default=None):
2566 return v if isinstance(v, bool) else default
2567
2568
2569 def strip_or_none(v, default=None):
2570 return v.strip() if isinstance(v, str) else default
2571
2572
2573 def url_or_none(url):
2574 if not url or not isinstance(url, str):
2575 return None
2576 url = url.strip()
2577 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2578
2579
2580 def request_to_url(req):
2581 if isinstance(req, urllib.request.Request):
2582 return req.get_full_url()
2583 else:
2584 return req
2585
2586
2587 def strftime_or_none(timestamp, date_format, default=None):
2588 datetime_object = None
2589 try:
2590 if isinstance(timestamp, (int, float)): # unix timestamp
2591 # Using naive datetime here can break timestamp() in Windows
2592 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2593 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
2594 elif isinstance(timestamp, str): # assume YYYYMMDD
2595 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2596 date_format = re.sub( # Support %s on windows
2597 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
2598 return datetime_object.strftime(date_format)
2599 except (ValueError, TypeError, AttributeError):
2600 return default
2601
2602
2603 def parse_duration(s):
2604 if not isinstance(s, str):
2605 return None
2606 s = s.strip()
2607 if not s:
2608 return None
2609
2610 days, hours, mins, secs, ms = [None] * 5
2611 m = re.match(r'''(?x)
2612 (?P<before_secs>
2613 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2614 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2615 (?P<ms>[.:][0-9]+)?Z?$
2616 ''', s)
2617 if m:
2618 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2619 else:
2620 m = re.match(
2621 r'''(?ix)(?:P?
2622 (?:
2623 [0-9]+\s*y(?:ears?)?,?\s*
2624 )?
2625 (?:
2626 [0-9]+\s*m(?:onths?)?,?\s*
2627 )?
2628 (?:
2629 [0-9]+\s*w(?:eeks?)?,?\s*
2630 )?
2631 (?:
2632 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2633 )?
2634 T)?
2635 (?:
2636 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2637 )?
2638 (?:
2639 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2640 )?
2641 (?:
2642 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2643 )?Z?$''', s)
2644 if m:
2645 days, hours, mins, secs, ms = m.groups()
2646 else:
2647 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2648 if m:
2649 hours, mins = m.groups()
2650 else:
2651 return None
2652
2653 if ms:
2654 ms = ms.replace(':', '.')
2655 return sum(float(part or 0) * mult for part, mult in (
2656 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2657
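# Usage sketch (illustrative):
#   parse_duration('1:30')     # -> 90.0
#   parse_duration('2h 30m')   # -> 9000.0
#   parse_duration('PT1H30M')  # -> 5400.0 (ISO 8601-style)
#   parse_duration('invalid')  # -> None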
2658
2659 def prepend_extension(filename, ext, expected_real_ext=None):
2660 name, real_ext = os.path.splitext(filename)
2661 return (
2662 f'{name}.{ext}{real_ext}'
2663 if not expected_real_ext or real_ext[1:] == expected_real_ext
2664 else f'{filename}.{ext}')
2665
2666
2667 def replace_extension(filename, ext, expected_real_ext=None):
2668 name, real_ext = os.path.splitext(filename)
2669 return '{}.{}'.format(
2670 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2671 ext)
2672
2673
2674 def check_executable(exe, args=[]):
2675 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2676 args can be a list of arguments for a short output (like -version) """
2677 try:
2678 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2679 except OSError:
2680 return False
2681 return exe
2682
2683
2684 def _get_exe_version_output(exe, args):
2685 try:
2686 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2687 # SIGTTOU if yt-dlp is run in the background.
2688 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2689 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2690 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2691 if ret:
2692 return None
2693 except OSError:
2694 return False
2695 return stdout
2696
2697
2698 def detect_exe_version(output, version_re=None, unrecognized='present'):
2699 assert isinstance(output, str)
2700 if version_re is None:
2701 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2702 m = re.search(version_re, output)
2703 if m:
2704 return m.group(1)
2705 else:
2706 return unrecognized
2707
2708
2709 def get_exe_version(exe, args=['--version'],
2710 version_re=None, unrecognized=('present', 'broken')):
2711 """ Returns the version of the specified executable,
2712 or False if the executable is not present """
2713 unrecognized = variadic(unrecognized)
2714 assert len(unrecognized) in (1, 2)
2715 out = _get_exe_version_output(exe, args)
2716 if out is None:
2717 return unrecognized[-1]
2718 return out and detect_exe_version(out, version_re, unrecognized[0])
2719
2720
2721 def frange(start=0, stop=None, step=1):
2722 """Float range"""
2723 if stop is None:
2724 start, stop = 0, start
2725 sign = [-1, 1][step > 0] if step else 0
2726 while sign * start < sign * stop:
2727 yield start
2728 start += step
2729
2730
2731 class LazyList(collections.abc.Sequence):
2732 """Lazy immutable list from an iterable
2733 Note that slices of a LazyList are lists and not LazyList"""
2734
2735 class IndexError(IndexError):
2736 pass
2737
2738 def __init__(self, iterable, *, reverse=False, _cache=None):
2739 self._iterable = iter(iterable)
2740 self._cache = [] if _cache is None else _cache
2741 self._reversed = reverse
2742
2743 def __iter__(self):
2744 if self._reversed:
2745 # We need to consume the entire iterable to iterate in reverse
2746 yield from self.exhaust()
2747 return
2748 yield from self._cache
2749 for item in self._iterable:
2750 self._cache.append(item)
2751 yield item
2752
2753 def _exhaust(self):
2754 self._cache.extend(self._iterable)
2755 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2756 return self._cache
2757
2758 def exhaust(self):
2759 """Evaluate the entire iterable"""
2760 return self._exhaust()[::-1 if self._reversed else 1]
2761
2762 @staticmethod
2763 def _reverse_index(x):
2764 return None if x is None else ~x
2765
2766 def __getitem__(self, idx):
2767 if isinstance(idx, slice):
2768 if self._reversed:
2769 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2770 start, stop, step = idx.start, idx.stop, idx.step or 1
2771 elif isinstance(idx, int):
2772 if self._reversed:
2773 idx = self._reverse_index(idx)
2774 start, stop, step = idx, idx, 0
2775 else:
2776 raise TypeError('indices must be integers or slices')
2777 if ((start or 0) < 0 or (stop or 0) < 0
2778 or (start is None and step < 0)
2779 or (stop is None and step > 0)):
2780 # We need to consume the entire iterable to be able to slice from the end
2781 # Obviously, never use this with infinite iterables
2782 self._exhaust()
2783 try:
2784 return self._cache[idx]
2785 except IndexError as e:
2786 raise self.IndexError(e) from e
2787 n = max(start or 0, stop or 0) - len(self._cache) + 1
2788 if n > 0:
2789 self._cache.extend(itertools.islice(self._iterable, n))
2790 try:
2791 return self._cache[idx]
2792 except IndexError as e:
2793 raise self.IndexError(e) from e
2794
2795 def __bool__(self):
2796 try:
2797 self[-1] if self._reversed else self[0]
2798 except self.IndexError:
2799 return False
2800 return True
2801
2802 def __len__(self):
2803 self._exhaust()
2804 return len(self._cache)
2805
2806 def __reversed__(self):
2807 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2808
2809 def __copy__(self):
2810 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2811
2812 def __repr__(self):
2813 # repr and str should mimic a list. So we exhaust the iterable
2814 return repr(self.exhaust())
2815
2816 def __str__(self):
2817 return repr(self.exhaust())
2818
2819
2820 class PagedList:
2821
2822 class IndexError(IndexError):
2823 pass
2824
2825 def __len__(self):
2826 # This is only useful for tests
2827 return len(self.getslice())
2828
2829 def __init__(self, pagefunc, pagesize, use_cache=True):
2830 self._pagefunc = pagefunc
2831 self._pagesize = pagesize
2832 self._pagecount = float('inf')
2833 self._use_cache = use_cache
2834 self._cache = {}
2835
2836 def getpage(self, pagenum):
2837 page_results = self._cache.get(pagenum)
2838 if page_results is None:
2839 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2840 if self._use_cache:
2841 self._cache[pagenum] = page_results
2842 return page_results
2843
2844 def getslice(self, start=0, end=None):
2845 return list(self._getslice(start, end))
2846
2847 def _getslice(self, start, end):
2848 raise NotImplementedError('This method must be implemented by subclasses')
2849
2850 def __getitem__(self, idx):
2851 assert self._use_cache, 'Indexing PagedList requires cache'
2852 if not isinstance(idx, int) or idx < 0:
2853 raise TypeError('indices must be non-negative integers')
2854 entries = self.getslice(idx, idx + 1)
2855 if not entries:
2856 raise self.IndexError()
2857 return entries[0]
2858
2859
2860 class OnDemandPagedList(PagedList):
2861 """Download pages until a page with less than maximum results"""
2862
2863 def _getslice(self, start, end):
2864 for pagenum in itertools.count(start // self._pagesize):
2865 firstid = pagenum * self._pagesize
2866 nextfirstid = pagenum * self._pagesize + self._pagesize
2867 if start >= nextfirstid:
2868 continue
2869
2870 startv = (
2871 start % self._pagesize
2872 if firstid <= start < nextfirstid
2873 else 0)
2874 endv = (
2875 ((end - 1) % self._pagesize) + 1
2876 if (end is not None and firstid <= end <= nextfirstid)
2877 else None)
2878
2879 try:
2880 page_results = self.getpage(pagenum)
2881 except Exception:
2882 self._pagecount = pagenum - 1
2883 raise
2884 if startv != 0 or endv is not None:
2885 page_results = page_results[startv:endv]
2886 yield from page_results
2887
2888 # A little optimization: if the current page is not "full", i.e. does
2889 # not contain page_size videos, we can assume that this page
2890 # is the last one - there are no more ids on further pages -
2891 # so there is no need to query again.
2892 if len(page_results) + startv < self._pagesize:
2893 break
2894
2895 # If we got the whole page, but the next page is not interesting,
2896 # break out early as well
2897 if end == nextfirstid:
2898 break
2899
2900
2901 class InAdvancePagedList(PagedList):
2902 """PagedList with total number of pages known in advance"""
2903
2904 def __init__(self, pagefunc, pagecount, pagesize):
2905 PagedList.__init__(self, pagefunc, pagesize, True)
2906 self._pagecount = pagecount
2907
2908 def _getslice(self, start, end):
2909 start_page = start // self._pagesize
2910 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2911 skip_elems = start - start_page * self._pagesize
2912 only_more = None if end is None else end - start
2913 for pagenum in range(start_page, end_page):
2914 page_results = self.getpage(pagenum)
2915 if skip_elems:
2916 page_results = page_results[skip_elems:]
2917 skip_elems = None
2918 if only_more is not None:
2919 if len(page_results) < only_more:
2920 only_more -= len(page_results)
2921 else:
2922 yield from page_results[:only_more]
2923 break
2924 yield from page_results
2925
2926
2927 class PlaylistEntries:
2928 MissingEntry = object()
2929 is_exhausted = False
2930
2931 def __init__(self, ydl, info_dict):
2932 self.ydl = ydl
2933
2934 # _entries must be assigned now since info_dict can change during iteration
2935 entries = info_dict.get('entries')
2936 if entries is None:
2937 raise EntryNotInPlaylist('There are no entries')
2938 elif isinstance(entries, list):
2939 self.is_exhausted = True
2940
2941 requested_entries = info_dict.get('requested_entries')
2942 self.is_incomplete = requested_entries is not None
2943 if self.is_incomplete:
2944 assert self.is_exhausted
2945 self._entries = [self.MissingEntry] * max(requested_entries or [0])
2946 for i, entry in zip(requested_entries, entries):
2947 self._entries[i - 1] = entry
2948 elif isinstance(entries, (list, PagedList, LazyList)):
2949 self._entries = entries
2950 else:
2951 self._entries = LazyList(entries)
2952
2953 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2954 (?P<start>[+-]?\d+)?
2955 (?P<range>[:-]
2956 (?P<end>[+-]?\d+|inf(?:inite)?)?
2957 (?::(?P<step>[+-]?\d+))?
2958 )?''')
2959
2960 @classmethod
2961 def parse_playlist_items(cls, string):
2962 for segment in string.split(','):
2963 if not segment:
2964 raise ValueError('There are two or more consecutive commas')
2965 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2966 if not mobj:
2967 raise ValueError(f'{segment!r} is not a valid specification')
2968 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2969 if int_or_none(step) == 0:
2970 raise ValueError(f'Step in {segment!r} cannot be zero')
2971 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2972
2973 def get_requested_items(self):
2974 playlist_items = self.ydl.params.get('playlist_items')
2975 playlist_start = self.ydl.params.get('playliststart', 1)
2976 playlist_end = self.ydl.params.get('playlistend')
2977 # For backwards compatibility, interpret -1 as whole list
2978 if playlist_end in (-1, None):
2979 playlist_end = ''
2980 if not playlist_items:
2981 playlist_items = f'{playlist_start}:{playlist_end}'
2982 elif playlist_start != 1 or playlist_end:
2983 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2984
2985 for index in self.parse_playlist_items(playlist_items):
2986 for i, entry in self[index]:
2987 yield i, entry
2988 if not entry:
2989 continue
2990 try:
2991 # The item may have just been added to archive. Don't break due to it
2992 if not self.ydl.params.get('lazy_playlist'):
2993 # TODO: Add auto-generated fields
2994 self.ydl._match_entry(entry, incomplete=True, silent=True)
2995 except (ExistingVideoReached, RejectedVideoReached):
2996 return
2997
2998 def get_full_count(self):
2999 if self.is_exhausted and not self.is_incomplete:
3000 return len(self)
3001 elif isinstance(self._entries, InAdvancePagedList):
3002 if self._entries._pagesize == 1:
3003 return self._entries._pagecount
3004
3005 @functools.cached_property
3006 def _getter(self):
3007 if isinstance(self._entries, list):
3008 def get_entry(i):
3009 try:
3010 entry = self._entries[i]
3011 except IndexError:
3012 entry = self.MissingEntry
3013 if not self.is_incomplete:
3014 raise self.IndexError()
3015 if entry is self.MissingEntry:
3016 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
3017 return entry
3018 else:
3019 def get_entry(i):
3020 try:
3021 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3022 except (LazyList.IndexError, PagedList.IndexError):
3023 raise self.IndexError()
3024 return get_entry
3025
3026 def __getitem__(self, idx):
3027 if isinstance(idx, int):
3028 idx = slice(idx, idx)
3029
3030 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3031 step = 1 if idx.step is None else idx.step
3032 if idx.start is None:
3033 start = 0 if step > 0 else len(self) - 1
3034 else:
3035 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3036
3037 # NB: Do not call len(self) when idx == [:]
3038 if idx.stop is None:
3039 stop = 0 if step < 0 else float('inf')
3040 else:
3041 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3042 stop += [-1, 1][step > 0]
3043
3044 for i in frange(start, stop, step):
3045 if i < 0:
3046 continue
3047 try:
3048 entry = self._getter(i)
3049 except self.IndexError:
3050 self.is_exhausted = True
3051 if step > 0:
3052 break
3053 continue
3054 yield i + 1, entry
3055
3056 def __len__(self):
3057 return len(tuple(self[:]))
3058
3059 class IndexError(IndexError):
3060 pass
3061
3062
3063 def uppercase_escape(s):
3064 unicode_escape = codecs.getdecoder('unicode_escape')
3065 return re.sub(
3066 r'\\U[0-9a-fA-F]{8}',
3067 lambda m: unicode_escape(m.group(0))[0],
3068 s)
3069
3070
3071 def lowercase_escape(s):
3072 unicode_escape = codecs.getdecoder('unicode_escape')
3073 return re.sub(
3074 r'\\u[0-9a-fA-F]{4}',
3075 lambda m: unicode_escape(m.group(0))[0],
3076 s)
3077
3078
3079 def escape_rfc3986(s):
3080 """Escape non-ASCII characters as suggested by RFC 3986"""
3081 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3082
3083
3084 def escape_url(url):
3085 """Escape URL as suggested by RFC 3986"""
3086 url_parsed = urllib.parse.urlparse(url)
3087 return url_parsed._replace(
3088 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3089 path=escape_rfc3986(url_parsed.path),
3090 params=escape_rfc3986(url_parsed.params),
3091 query=escape_rfc3986(url_parsed.query),
3092 fragment=escape_rfc3986(url_parsed.fragment)
3093 ).geturl()
3094
3095
3096 def parse_qs(url, **kwargs):
3097 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
3098
3099
3100 def read_batch_urls(batch_fd):
3101 def fixup(url):
3102 if not isinstance(url, str):
3103 url = url.decode('utf-8', 'replace')
3104 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3105 for bom in BOM_UTF8:
3106 if url.startswith(bom):
3107 url = url[len(bom):]
3108 url = url.lstrip()
3109 if not url or url.startswith(('#', ';', ']')):
3110 return False
3111 # "#" cannot be stripped out since it is part of the URI
3112 # However, it can be safely stripped out if it follows a whitespace
3113 return re.split(r'\s#', url, 1)[0].rstrip()
3114
3115 with contextlib.closing(batch_fd) as fd:
3116 return [url for url in map(fixup, fd) if url]
3117
3118
3119 def urlencode_postdata(*args, **kargs):
3120 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3121
3122
3123 def update_url(url, *, query_update=None, **kwargs):
3124 """Replace URL components specified by kwargs
3125 @param url str or parse url tuple
3126 @param query_update update query
3127 @returns str
3128 """
3129 if isinstance(url, str):
3130 if not kwargs and not query_update:
3131 return url
3132 else:
3133 url = urllib.parse.urlparse(url)
3134 if query_update:
3135 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
3136 kwargs['query'] = urllib.parse.urlencode({
3137 **urllib.parse.parse_qs(url.query),
3138 **query_update
3139 }, True)
3140 return urllib.parse.urlunparse(url._replace(**kwargs))
3141
3142
3143 def update_url_query(url, query):
3144 return update_url(url, query_update=query)
3145
3146
3147 def update_Request(req, url=None, data=None, headers=None, query=None):
3148 req_headers = req.headers.copy()
3149 req_headers.update(headers or {})
3150 req_data = data or req.data
3151 req_url = update_url_query(url or req.get_full_url(), query)
3152 req_get_method = req.get_method()
3153 if req_get_method == 'HEAD':
3154 req_type = HEADRequest
3155 elif req_get_method == 'PUT':
3156 req_type = PUTRequest
3157 else:
3158 req_type = urllib.request.Request
3159 new_req = req_type(
3160 req_url, data=req_data, headers=req_headers,
3161 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3162 if hasattr(req, 'timeout'):
3163 new_req.timeout = req.timeout
3164 return new_req
3165
3166
3167 def _multipart_encode_impl(data, boundary):
3168 content_type = 'multipart/form-data; boundary=%s' % boundary
3169
3170 out = b''
3171 for k, v in data.items():
3172 out += b'--' + boundary.encode('ascii') + b'\r\n'
3173 if isinstance(k, str):
3174 k = k.encode()
3175 if isinstance(v, str):
3176 v = v.encode()
3177 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3178 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3179 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3180 if boundary.encode('ascii') in content:
3181 raise ValueError('Boundary overlaps with data')
3182 out += content
3183
3184 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3185
3186 return out, content_type
3187
3188
3189 def multipart_encode(data, boundary=None):
3190 '''
3191 Encode a dict to RFC 7578-compliant form-data
3192
3193 data:
3194 A dict where keys and values can be either Unicode or bytes-like
3195 objects.
3196 boundary:
3197 If specified, the given Unicode object is used as the boundary.
3198 Otherwise a random boundary is generated.
3199
3200 Reference: https://tools.ietf.org/html/rfc7578
3201 '''
3202 has_specified_boundary = boundary is not None
3203
3204 while True:
3205 if boundary is None:
3206 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3207
3208 try:
3209 out, content_type = _multipart_encode_impl(data, boundary)
3210 break
3211 except ValueError:
3212 if has_specified_boundary:
3213 raise
3214 boundary = None
3215
3216 return out, content_type
3217
3218
3219 def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
3220 if blocked_types is NO_DEFAULT:
3221 blocked_types = (str, bytes, collections.abc.Mapping)
3222 return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
3223
3224
3225 def variadic(x, allowed_types=NO_DEFAULT):
3226 return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
3227
3228
3229 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3230 for f in funcs:
3231 try:
3232 val = f(*args, **kwargs)
3233 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
3234 pass
3235 else:
3236 if expected_type is None or isinstance(val, expected_type):
3237 return val
3238
3239
3240 def try_get(src, getter, expected_type=None):
3241 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3242
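# Usage sketch (illustrative):
#   try_get({'a': {'b': 1}}, lambda x: x['a']['b'], int)  # -> 1
#   try_get({}, lambda x: x['missing'])                   # -> None (KeyError is swallowed)
#   try_call(lambda: 1 // 0, lambda: 42)                  # -> 42 (first callable fails)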
3243
3244 def filter_dict(dct, cndn=lambda _, v: v is not None):
3245 return {k: v for k, v in dct.items() if cndn(k, v)}
3246
3247
3248 def merge_dicts(*dicts):
3249 merged = {}
3250 for a_dict in dicts:
3251 for k, v in a_dict.items():
3252 if (v is not None and k not in merged
3253 or isinstance(v, str) and merged[k] == ''):
3254 merged[k] = v
3255 return merged
3256
3257
3258 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3259 return string if isinstance(string, str) else str(string, encoding, errors)
3260
3261
3262 US_RATINGS = {
3263 'G': 0,
3264 'PG': 10,
3265 'PG-13': 13,
3266 'R': 16,
3267 'NC': 18,
3268 }
3269
3270
3271 TV_PARENTAL_GUIDELINES = {
3272 'TV-Y': 0,
3273 'TV-Y7': 7,
3274 'TV-G': 0,
3275 'TV-PG': 0,
3276 'TV-14': 14,
3277 'TV-MA': 17,
3278 }
3279
3280
3281 def parse_age_limit(s):
3282 # isinstance(False, int) is True. So type() must be used instead
3283 if type(s) is int: # noqa: E721
3284 return s if 0 <= s <= 21 else None
3285 elif not isinstance(s, str):
3286 return None
3287 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3288 if m:
3289 return int(m.group('age'))
3290 s = s.upper()
3291 if s in US_RATINGS:
3292 return US_RATINGS[s]
3293 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3294 if m:
3295 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3296 return None
3297
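# Usage sketch (illustrative):
#   parse_age_limit(18)       # -> 18
#   parse_age_limit('18+')    # -> 18
#   parse_age_limit('PG-13')  # -> 13
#   parse_age_limit('TV-MA')  # -> 17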
3298
3299 def strip_jsonp(code):
3300 return re.sub(
3301 r'''(?sx)^
3302 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3303 (?:\s*&&\s*(?P=func_name))?
3304 \s*\(\s*(?P<callback_data>.*)\);?
3305 \s*?(?://[^\n]*)*$''',
3306 r'\g<callback_data>', code)
3307
3308
3309 def js_to_json(code, vars={}, *, strict=False):
3310 # vars is a dict of var, val pairs to substitute
3311 STRING_QUOTES = '\'"`'
3312 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
3313 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3314 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3315 INTEGER_TABLE = (
3316 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3317 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3318 )
3319
3320 def process_escape(match):
3321 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3322 escape = match.group(1) or match.group(2)
3323
3324 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3325 else R'\u00' if escape == 'x'
3326 else '' if escape == '\n'
3327 else escape)
3328
3329 def template_substitute(match):
3330 evaluated = js_to_json(match.group(1), vars, strict=strict)
3331 if evaluated[0] == '"':
3332 return json.loads(evaluated)
3333 return evaluated
3334
3335 def fix_kv(m):
3336 v = m.group(0)
3337 if v in ('true', 'false', 'null'):
3338 return v
3339 elif v in ('undefined', 'void 0'):
3340 return 'null'
3341 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3342 return ''
3343
3344 if v[0] in STRING_QUOTES:
3345 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
3346 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
3347 return f'"{escaped}"'
3348
3349 for regex, base in INTEGER_TABLE:
3350 im = re.match(regex, v)
3351 if im:
3352 i = int(im.group(1), base)
3353 return f'"{i}":' if v.endswith(':') else str(i)
3354
3355 if v in vars:
3356 try:
3357 if not strict:
3358 json.loads(vars[v])
3359 except json.JSONDecodeError:
3360 return json.dumps(vars[v])
3361 else:
3362 return vars[v]
3363
3364 if not strict:
3365 return f'"{v}"'
3366
3367 raise ValueError(f'Unknown value: {v}')
3368
3369 def create_map(mobj):
3370 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3371
3372 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3373 if not strict:
3374 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3375 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
3376 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
3377 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
3378
3379 return re.sub(rf'''(?sx)
3380 {STRING_RE}|
3381 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
3382 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3383 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3384 [0-9]+(?={SKIP_RE}:)|
3385 !+
3386 ''', fix_kv, code)
3387
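# Usage sketch (illustrative):
#   js_to_json("{abc: true, 'def': 0x10}")
#   # -> '{"abc": true, "def": 16}'
#   js_to_json('{key: value}', vars={'value': '"str"'})
#   # -> '{"key": "str"}'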
3388
3389 def qualities(quality_ids):
3390 """ Get a numeric quality value out of a list of possible values """
3391 def q(qid):
3392 try:
3393 return quality_ids.index(qid)
3394 except ValueError:
3395 return -1
3396 return q
3397
3398
3399 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3400
3401
3402 DEFAULT_OUTTMPL = {
3403 'default': '%(title)s [%(id)s].%(ext)s',
3404 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3405 }
3406 OUTTMPL_TYPES = {
3407 'chapter': None,
3408 'subtitle': None,
3409 'thumbnail': None,
3410 'description': 'description',
3411 'annotation': 'annotations.xml',
3412 'infojson': 'info.json',
3413 'link': None,
3414 'pl_video': None,
3415 'pl_thumbnail': None,
3416 'pl_description': 'description',
3417 'pl_infojson': 'info.json',
3418 }
3419
3420 # As of [1], the format syntax is:
3421 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3422 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3423 STR_FORMAT_RE_TMPL = r'''(?x)
3424 (?<!%)(?P<prefix>(?:%%)*)
3425 %
3426 (?P<has_key>\((?P<key>{0})\))?
3427 (?P<format>
3428 (?P<conversion>[#0\-+ ]+)?
3429 (?P<min_width>\d+)?
3430 (?P<precision>\.\d+)?
3431 (?P<len_mod>[hlL])? # unused in python
3432 {1} # conversion type
3433 )
3434 '''
3435
3436
3437 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3438
3439
3440 def limit_length(s, length):
3441 """ Add ellipses to overly long strings """
3442 if s is None:
3443 return None
3444 ELLIPSES = '...'
3445 if len(s) > length:
3446 return s[:length - len(ELLIPSES)] + ELLIPSES
3447 return s
3448
3449
3450 def version_tuple(v):
3451 return tuple(int(e) for e in re.split(r'[-.]', v))
3452
3453
3454 def is_outdated_version(version, limit, assume_new=True):
3455 if not version:
3456 return not assume_new
3457 try:
3458 return version_tuple(version) < version_tuple(limit)
3459 except ValueError:
3460 return not assume_new
3461
3462
3463 def ytdl_is_updateable():
3464 """ Returns if yt-dlp can be updated with -U """
3465
3466 from ..update import is_non_updateable
3467
3468 return not is_non_updateable()
3469
3470
3471 def args_to_str(args):
3472 # Get a short string representation for a subprocess command
3473 return ' '.join(compat_shlex_quote(a) for a in args)
3474
3475
3476 def error_to_str(err):
3477 return f'{type(err).__name__}: {err}'
3478
3479
3480 def mimetype2ext(mt, default=NO_DEFAULT):
3481 if not isinstance(mt, str):
3482 if default is not NO_DEFAULT:
3483 return default
3484 return None
3485
3486 MAP = {
3487 # video
3488 '3gpp': '3gp',
3489 'mp2t': 'ts',
3490 'mp4': 'mp4',
3491 'mpeg': 'mpeg',
3492 'mpegurl': 'm3u8',
3493 'quicktime': 'mov',
3494 'webm': 'webm',
3495 'vp9': 'vp9',
3496 'x-flv': 'flv',
3497 'x-m4v': 'm4v',
3498 'x-matroska': 'mkv',
3499 'x-mng': 'mng',
3500 'x-mp4-fragmented': 'mp4',
3501 'x-ms-asf': 'asf',
3502 'x-ms-wmv': 'wmv',
3503 'x-msvideo': 'avi',
3504
3505 # application (streaming playlists)
3506 'dash+xml': 'mpd',
3507 'f4m+xml': 'f4m',
3508 'hds+xml': 'f4m',
3509 'vnd.apple.mpegurl': 'm3u8',
3510 'vnd.ms-sstr+xml': 'ism',
3511 'x-mpegurl': 'm3u8',
3512
3513 # audio
3514 'audio/mp4': 'm4a',
3515 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3516 # Using .mp3 as it's the most popular one
3517 'audio/mpeg': 'mp3',
3518 'audio/webm': 'webm',
3519 'audio/x-matroska': 'mka',
3520 'audio/x-mpegurl': 'm3u',
3521 'midi': 'mid',
3522 'ogg': 'ogg',
3523 'wav': 'wav',
3524 'wave': 'wav',
3525 'x-aac': 'aac',
3526 'x-flac': 'flac',
3527 'x-m4a': 'm4a',
3528 'x-realaudio': 'ra',
3529 'x-wav': 'wav',
3530
3531 # image
3532 'avif': 'avif',
3533 'bmp': 'bmp',
3534 'gif': 'gif',
3535 'jpeg': 'jpg',
3536 'png': 'png',
3537 'svg+xml': 'svg',
3538 'tiff': 'tif',
3539 'vnd.wap.wbmp': 'wbmp',
3540 'webp': 'webp',
3541 'x-icon': 'ico',
3542 'x-jng': 'jng',
3543 'x-ms-bmp': 'bmp',
3544
3545 # caption
3546 'filmstrip+json': 'fs',
3547 'smptett+xml': 'tt',
3548 'ttaf+xml': 'dfxp',
3549 'ttml+xml': 'ttml',
3550 'x-ms-sami': 'sami',
3551
3552 # misc
3553 'gzip': 'gz',
3554 'json': 'json',
3555 'xml': 'xml',
3556 'zip': 'zip',
3557 }
3558
3559 mimetype = mt.partition(';')[0].strip().lower()
3560 _, _, subtype = mimetype.rpartition('/')
3561
3562 ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
3563 if ext:
3564 return ext
3565 elif default is not NO_DEFAULT:
3566 return default
3567 return subtype.replace('+', '.')
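
# Examples (illustrative):
# mimetype2ext('video/mp4; codecs="avc1.42E01E"') == 'mp4' (parameters are stripped)
# mimetype2ext('application/vnd.apple.mpegurl') == 'm3u8' (via the MAP lookup)
# mimetype2ext('application/dicom') == 'dicom' (fallback to the bare subtype)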
3568
3569
3570 def ext2mimetype(ext_or_url):
3571 if not ext_or_url:
3572 return None
3573 if '.' not in ext_or_url:
3574 ext_or_url = f'file.{ext_or_url}'
3575 return mimetypes.guess_type(ext_or_url)[0]
3576
3577
3578 def parse_codecs(codecs_str):
3579 # http://tools.ietf.org/html/rfc6381
3580 if not codecs_str:
3581 return {}
3582 split_codecs = list(filter(None, map(
3583 str.strip, codecs_str.strip().strip(',').split(','))))
3584 vcodec, acodec, scodec, hdr = None, None, None, None
3585 for full_codec in split_codecs:
3586 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3587 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3588 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3589 if vcodec:
3590 continue
3591 vcodec = full_codec
3592 if parts[0] in ('dvh1', 'dvhe'):
3593 hdr = 'DV'
3594 elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
3595 hdr = 'HDR10'
3596 elif parts[:2] == ['vp9', '2']:
3597 hdr = 'HDR10'
3598 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
3599 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3600 acodec = acodec or full_codec
3601 elif parts[0] in ('stpp', 'wvtt'):
3602 scodec = scodec or full_codec
3603 else:
3604 write_string(f'WARNING: Unknown codec {full_codec}\n')
3605 if vcodec or acodec or scodec:
3606 return {
3607 'vcodec': vcodec or 'none',
3608 'acodec': acodec or 'none',
3609 'dynamic_range': hdr,
3610 **({'scodec': scodec} if scodec is not None else {}),
3611 }
3612 elif len(split_codecs) == 2:
3613 return {
3614 'vcodec': split_codecs[0],
3615 'acodec': split_codecs[1],
3616 }
3617 return {}
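
# Examples (illustrative):
# parse_codecs('avc1.64001F, mp4a.40.2')
# == {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
# parse_codecs('dvh1.05.01') should report 'dynamic_range': 'DV'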
3618
3619
3620 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3621 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3622
3623 allow_mkv = not preferences or 'mkv' in preferences
3624
3625 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3626 return 'mkv' # TODO: any other format allows this?
3627
3628 # TODO: Not all codecs supported by parse_codecs are handled here
3629 COMPATIBLE_CODECS = {
3630 'mp4': {
3631 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
3632 'h264', 'aacl', 'ec-3', # Set in ISM
3633 },
3634 'webm': {
3635 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3636 'vp9x', 'vp8x', # in the webm spec
3637 },
3638 }
3639
3640 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3641 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3642
3643 for ext in preferences or COMPATIBLE_CODECS.keys():
3644 codec_set = COMPATIBLE_CODECS.get(ext, set())
3645 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3646 return ext
3647
3648 COMPATIBLE_EXTS = (
3649 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3650 {'webm', 'weba'},
3651 )
3652 for ext in preferences or vexts:
3653 current_exts = {ext, *vexts, *aexts}
3654 if ext == 'mkv' or current_exts == {ext} or any(
3655 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3656 return ext
3657 return 'mkv' if allow_mkv else preferences[-1]
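
# Example (illustrative): a single av1 + mp4a pair fits into mp4, so
# get_compatible_ext(vcodecs=['av1'], acodecs=['mp4a'], vexts=['mp4'], aexts=['m4a']) == 'mp4'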
3658
3659
3660 def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
3661 getheader = url_handle.headers.get
3662
3663 cd = getheader('Content-Disposition')
3664 if cd:
3665 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3666 if m:
3667 e = determine_ext(m.group('filename'), default_ext=None)
3668 if e:
3669 return e
3670
3671 meta_ext = getheader('x-amz-meta-name')
3672 if meta_ext:
3673 e = meta_ext.rpartition('.')[2]
3674 if e:
3675 return e
3676
3677 return mimetype2ext(getheader('Content-Type'), default=default)
3678
3679
3680 def encode_data_uri(data, mime_type):
3681 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
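
# Example (illustrative): encode_data_uri(b'hi', 'text/plain') == 'data:text/plain;base64,aGk='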
3682
3683
3684 def age_restricted(content_limit, age_limit):
3685 """ Returns True iff the content should be blocked """
3686
3687 if age_limit is None: # No limit set
3688 return False
3689 if content_limit is None:
3690 return False # Content available for everyone
3691 return age_limit < content_limit
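
# Example (illustrative): age_restricted(content_limit=18, age_limit=16) is True (blocked),
# while age_restricted(content_limit=18, age_limit=None) is False (no limit set)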
3692
3693
3694 # List of known byte-order-marks (BOM)
3695 BOMS = [
3696 (b'\xef\xbb\xbf', 'utf-8'),
3697 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3698 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3699 (b'\xff\xfe', 'utf-16-le'),
3700 (b'\xfe\xff', 'utf-16-be'),
3701 ]
3702
3703
3704 def is_html(first_bytes):
3705 """ Detect whether a file contains HTML by examining its first bytes. """
3706
3707 encoding = 'utf-8'
3708 for bom, enc in BOMS:
3709 while first_bytes.startswith(bom):
3710 encoding, first_bytes = enc, first_bytes[len(bom):]
3711
3712 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
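
# Example (illustrative): is_html(b'<!DOCTYPE html>') is truthy, and a leading BOM
# (e.g. b'\xef\xbb\xbf<html>') is stripped before the check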
3713
3714
3715 def determine_protocol(info_dict):
3716 protocol = info_dict.get('protocol')
3717 if protocol is not None:
3718 return protocol
3719
3720 url = sanitize_url(info_dict['url'])
3721 if url.startswith('rtmp'):
3722 return 'rtmp'
3723 elif url.startswith('mms'):
3724 return 'mms'
3725 elif url.startswith('rtsp'):
3726 return 'rtsp'
3727
3728 ext = determine_ext(url)
3729 if ext == 'm3u8':
3730 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3731 elif ext == 'f4m':
3732 return 'f4m'
3733
3734 return urllib.parse.urlparse(url).scheme
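
# Example (illustrative): determine_protocol({'url': 'rtmp://example.com/live'}) == 'rtmp',
# while an explicit 'protocol' key in info_dict always takes precedence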
3735
3736
3737 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3738 """ Render a list of rows, each as a list of values.
3739 Text after a \t will be right aligned """
3740 def width(string):
3741 return len(remove_terminal_sequences(string).replace('\t', ''))
3742
3743 def get_max_lens(table):
3744 return [max(width(str(v)) for v in col) for col in zip(*table)]
3745
3746 def filter_using_list(row, filterArray):
3747 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3748
3749 max_lens = get_max_lens(data) if hide_empty else []
3750 header_row = filter_using_list(header_row, max_lens)
3751 data = [filter_using_list(row, max_lens) for row in data]
3752
3753 table = [header_row] + data
3754 max_lens = get_max_lens(table)
3755 extra_gap += 1
3756 if delim:
3757 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3758 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3759 for row in table:
3760 for pos, text in enumerate(map(str, row)):
3761 if '\t' in text:
3762 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3763 else:
3764 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3765 ret = '\n'.join(''.join(row).rstrip() for row in table)
3766 return ret
3767
3768
3769 def _match_one(filter_part, dct, incomplete):
3770 # TODO: Generalize code with YoutubeDL._build_format_filter
3771 STRING_OPERATORS = {
3772 '*=': operator.contains,
3773 '^=': lambda attr, value: attr.startswith(value),
3774 '$=': lambda attr, value: attr.endswith(value),
3775 '~=': lambda attr, value: re.search(value, attr),
3776 }
3777 COMPARISON_OPERATORS = {
3778 **STRING_OPERATORS,
3779 '<=': operator.le, # "<=" must be defined above "<"
3780 '<': operator.lt,
3781 '>=': operator.ge,
3782 '>': operator.gt,
3783 '=': operator.eq,
3784 }
3785
3786 if isinstance(incomplete, bool):
3787 is_incomplete = lambda _: incomplete
3788 else:
3789 is_incomplete = lambda k: k in incomplete
3790
3791 operator_rex = re.compile(r'''(?x)
3792 (?P<key>[a-z_]+)
3793 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3794 (?:
3795 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3796 (?P<strval>.+?)
3797 )
3798 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3799 m = operator_rex.fullmatch(filter_part.strip())
3800 if m:
3801 m = m.groupdict()
3802 unnegated_op = COMPARISON_OPERATORS[m['op']]
3803 if m['negation']:
3804 op = lambda attr, value: not unnegated_op(attr, value)
3805 else:
3806 op = unnegated_op
3807 comparison_value = m['quotedstrval'] or m['strval']
3808 if m['quote']:
3809 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3810 actual_value = dct.get(m['key'])
3811 numeric_comparison = None
3812 if isinstance(actual_value, (int, float)):
3813 # If the original field is a string and the matching comparison value is
3814 # a number, we should respect the origin of the original field
3815 # and process the comparison value as a string (see
3816 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3817 try:
3818 numeric_comparison = int(comparison_value)
3819 except ValueError:
3820 numeric_comparison = parse_filesize(comparison_value)
3821 if numeric_comparison is None:
3822 numeric_comparison = parse_filesize(f'{comparison_value}B')
3823 if numeric_comparison is None:
3824 numeric_comparison = parse_duration(comparison_value)
3825 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3826 raise ValueError('Operator %s only supports string values!' % m['op'])
3827 if actual_value is None:
3828 return is_incomplete(m['key']) or m['none_inclusive']
3829 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3830
3831 UNARY_OPERATORS = {
3832 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3833 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3834 }
3835 operator_rex = re.compile(r'''(?x)
3836 (?P<op>%s)\s*(?P<key>[a-z_]+)
3837 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3838 m = operator_rex.fullmatch(filter_part.strip())
3839 if m:
3840 op = UNARY_OPERATORS[m.group('op')]
3841 actual_value = dct.get(m.group('key'))
3842 if is_incomplete(m.group('key')) and actual_value is None:
3843 return True
3844 return op(actual_value)
3845
3846 raise ValueError('Invalid filter part %r' % filter_part)
3847
3848
3849 def match_str(filter_str, dct, incomplete=False):
3850 """ Filter a dictionary with a simple string syntax.
3851 @returns Whether the filter passes
3852 @param incomplete Set of keys that is expected to be missing from dct.
3853 Can be True/False to indicate all/none of the keys may be missing.
3854 All conditions on incomplete keys pass if the key is missing
3855 """
3856 return all(
3857 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3858 for filter_part in re.split(r'(?<!\\)&', filter_str))
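
# Example (illustrative): '&' separates conditions and '?' lets one pass when the field
# is missing, so match_str('duration > 600 & like_count >? 100', {'duration': 700}) is True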
3859
3860
3861 def match_filter_func(filters, breaking_filters=None):
3862 if not filters and not breaking_filters:
3863 return None
3864 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3865 filters = set(variadic(filters or []))
3866
3867 interactive = '-' in filters
3868 if interactive:
3869 filters.remove('-')
3870
3871 def _match_func(info_dict, incomplete=False):
3872 ret = breaking_filters(info_dict, incomplete)
3873 if ret is not None:
3874 raise RejectedVideoReached(ret)
3875
3876 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3877 return NO_DEFAULT if interactive and not incomplete else None
3878 else:
3879 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3880 filter_str = ') | ('.join(map(str.strip, filters))
3881 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3882 return _match_func
3883
3884
3885 class download_range_func:
3886 def __init__(self, chapters, ranges):
3887 self.chapters, self.ranges = chapters, ranges
3888
3889 def __call__(self, info_dict, ydl):
3890 if not self.ranges and not self.chapters:
3891 yield {}
3892
3893 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3894 else 'Cannot match chapters since chapter information is unavailable')
3895 for regex in self.chapters or []:
3896 for i, chapter in enumerate(info_dict.get('chapters') or []):
3897 if re.search(regex, chapter['title']):
3898 warning = None
3899 yield {**chapter, 'index': i}
3900 if self.chapters and warning:
3901 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3902
3903 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3904
3905 def __eq__(self, other):
3906 return (isinstance(other, download_range_func)
3907 and self.chapters == other.chapters and self.ranges == other.ranges)
3908
3909 def __repr__(self):
3910 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
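
# Example (illustrative): a plain time range yields one section dict;
# list(download_range_func([], [(10, 20)])({'id': 'x'}, ydl=None))
# == [{'start_time': 10, 'end_time': 20}]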
3911
3912
3913 def parse_dfxp_time_expr(time_expr):
3914 if not time_expr:
3915 return
3916
3917 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3918 if mobj:
3919 return float(mobj.group('time_offset'))
3920
3921 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3922 if mobj:
3923 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
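
# Examples (illustrative): parse_dfxp_time_expr('5.2s') == 5.2
# and parse_dfxp_time_expr('00:01:02.500') == 62.5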
3924
3925
3926 def srt_subtitles_timecode(seconds):
3927 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3928
3929
3930 def ass_subtitles_timecode(seconds):
3931 time = timetuple_from_msec(seconds * 1000)
3932 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
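
# Examples (illustrative): srt_subtitles_timecode(61.5) == '00:01:01,500'
# and ass_subtitles_timecode(61.5) == '0:01:01.50'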
3933
3934
3935 def dfxp2srt(dfxp_data):
3936 '''
3937 @param dfxp_data A bytes-like object containing DFXP data
3938 @returns A unicode object containing converted SRT data
3939 '''
3940 LEGACY_NAMESPACES = (
3941 (b'http://www.w3.org/ns/ttml', [
3942 b'http://www.w3.org/2004/11/ttaf1',
3943 b'http://www.w3.org/2006/04/ttaf1',
3944 b'http://www.w3.org/2006/10/ttaf1',
3945 ]),
3946 (b'http://www.w3.org/ns/ttml#styling', [
3947 b'http://www.w3.org/ns/ttml#style',
3948 ]),
3949 )
3950
3951 SUPPORTED_STYLING = [
3952 'color',
3953 'fontFamily',
3954 'fontSize',
3955 'fontStyle',
3956 'fontWeight',
3957 'textDecoration'
3958 ]
3959
3960 _x = functools.partial(xpath_with_ns, ns_map={
3961 'xml': 'http://www.w3.org/XML/1998/namespace',
3962 'ttml': 'http://www.w3.org/ns/ttml',
3963 'tts': 'http://www.w3.org/ns/ttml#styling',
3964 })
3965
3966 styles = {}
3967 default_style = {}
3968
3969 class TTMLPElementParser:
3970 _out = ''
3971 _unclosed_elements = []
3972 _applied_styles = []
3973
3974 def start(self, tag, attrib):
3975 if tag in (_x('ttml:br'), 'br'):
3976 self._out += '\n'
3977 else:
3978 unclosed_elements = []
3979 style = {}
3980 element_style_id = attrib.get('style')
3981 if default_style:
3982 style.update(default_style)
3983 if element_style_id:
3984 style.update(styles.get(element_style_id, {}))
3985 for prop in SUPPORTED_STYLING:
3986 prop_val = attrib.get(_x('tts:' + prop))
3987 if prop_val:
3988 style[prop] = prop_val
3989 if style:
3990 font = ''
3991 for k, v in sorted(style.items()):
3992 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3993 continue
3994 if k == 'color':
3995 font += ' color="%s"' % v
3996 elif k == 'fontSize':
3997 font += ' size="%s"' % v
3998 elif k == 'fontFamily':
3999 font += ' face="%s"' % v
4000 elif k == 'fontWeight' and v == 'bold':
4001 self._out += '<b>'
4002 unclosed_elements.append('b')
4003 elif k == 'fontStyle' and v == 'italic':
4004 self._out += '<i>'
4005 unclosed_elements.append('i')
4006 elif k == 'textDecoration' and v == 'underline':
4007 self._out += '<u>'
4008 unclosed_elements.append('u')
4009 if font:
4010 self._out += '<font' + font + '>'
4011 unclosed_elements.append('font')
4012 applied_style = {}
4013 if self._applied_styles:
4014 applied_style.update(self._applied_styles[-1])
4015 applied_style.update(style)
4016 self._applied_styles.append(applied_style)
4017 self._unclosed_elements.append(unclosed_elements)
4018
4019 def end(self, tag):
4020 if tag not in (_x('ttml:br'), 'br'):
4021 unclosed_elements = self._unclosed_elements.pop()
4022 for element in reversed(unclosed_elements):
4023 self._out += '</%s>' % element
4024 if unclosed_elements and self._applied_styles:
4025 self._applied_styles.pop()
4026
4027 def data(self, data):
4028 self._out += data
4029
4030 def close(self):
4031 return self._out.strip()
4032
4033 # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
4034 # This will not trigger false positives since only UTF-8 text is being replaced
4035 dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
4036
4037 def parse_node(node):
4038 target = TTMLPElementParser()
4039 parser = xml.etree.ElementTree.XMLParser(target=target)
4040 parser.feed(xml.etree.ElementTree.tostring(node))
4041 return parser.close()
4042
4043 for k, v in LEGACY_NAMESPACES:
4044 for ns in v:
4045 dfxp_data = dfxp_data.replace(ns, k)
4046
4047 dfxp = compat_etree_fromstring(dfxp_data)
4048 out = []
4049 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4050
4051 if not paras:
4052 raise ValueError('Invalid dfxp/TTML subtitle')
4053
4054 repeat = False
4055 while True:
4056 for style in dfxp.findall(_x('.//ttml:style')):
4057 style_id = style.get('id') or style.get(_x('xml:id'))
4058 if not style_id:
4059 continue
4060 parent_style_id = style.get('style')
4061 if parent_style_id:
4062 if parent_style_id not in styles:
4063 repeat = True
4064 continue
4065 styles[style_id] = styles[parent_style_id].copy()
4066 for prop in SUPPORTED_STYLING:
4067 prop_val = style.get(_x('tts:' + prop))
4068 if prop_val:
4069 styles.setdefault(style_id, {})[prop] = prop_val
4070 if repeat:
4071 repeat = False
4072 else:
4073 break
4074
4075 for p in ('body', 'div'):
4076 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4077 if ele is None:
4078 continue
4079 style = styles.get(ele.get('style'))
4080 if not style:
4081 continue
4082 default_style.update(style)
4083
4084 for para, index in zip(paras, itertools.count(1)):
4085 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4086 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4087 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4088 if begin_time is None:
4089 continue
4090 if not end_time:
4091 if not dur:
4092 continue
4093 end_time = begin_time + dur
4094 out.append('%d\n%s --> %s\n%s\n\n' % (
4095 index,
4096 srt_subtitles_timecode(begin_time),
4097 srt_subtitles_timecode(end_time),
4098 parse_node(para)))
4099
4100 return ''.join(out)
4101
4102
4103 def cli_option(params, command_option, param, separator=None):
4104 param = params.get(param)
4105 return ([] if param is None
4106 else [command_option, str(param)] if separator is None
4107 else [f'{command_option}{separator}{param}'])
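
# Examples (illustrative):
# cli_option({'proxy': 'socks5://127.0.0.1'}, '--proxy', 'proxy') == ['--proxy', 'socks5://127.0.0.1']
# cli_option({'retries': 10}, '--retries', 'retries', separator='=') == ['--retries=10']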
4108
4109
4110 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4111 param = params.get(param)
4112 assert param in (True, False, None)
4113 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
4114
4115
4116 def cli_valueless_option(params, command_option, param, expected_value=True):
4117 return [command_option] if params.get(param) == expected_value else []
4118
4119
4120 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4121 if isinstance(argdict, (list, tuple)): # for backward compatibility
4122 if use_compat:
4123 return argdict
4124 else:
4125 argdict = None
4126 if argdict is None:
4127 return default
4128 assert isinstance(argdict, dict)
4129
4130 assert isinstance(keys, (list, tuple))
4131 for key_list in keys:
4132 arg_list = list(filter(
4133 lambda x: x is not None,
4134 [argdict.get(key.lower()) for key in variadic(key_list)]))
4135 if arg_list:
4136 return [arg for args in arg_list for arg in args]
4137 return default
4138
4139
4140 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4141 main_key, exe = main_key.lower(), exe.lower()
4142 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4143 keys = [f'{root_key}{k}' for k in (keys or [''])]
4144 if root_key in keys:
4145 if main_key != exe:
4146 keys.append((main_key, exe))
4147 keys.append('default')
4148 else:
4149 use_compat = False
4150 return cli_configuration_args(argdict, keys, default, use_compat)
4151
4152
4153 class ISO639Utils:
4154 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4155 _lang_map = {
4156 'aa': 'aar',
4157 'ab': 'abk',
4158 'ae': 'ave',
4159 'af': 'afr',
4160 'ak': 'aka',
4161 'am': 'amh',
4162 'an': 'arg',
4163 'ar': 'ara',
4164 'as': 'asm',
4165 'av': 'ava',
4166 'ay': 'aym',
4167 'az': 'aze',
4168 'ba': 'bak',
4169 'be': 'bel',
4170 'bg': 'bul',
4171 'bh': 'bih',
4172 'bi': 'bis',
4173 'bm': 'bam',
4174 'bn': 'ben',
4175 'bo': 'bod',
4176 'br': 'bre',
4177 'bs': 'bos',
4178 'ca': 'cat',
4179 'ce': 'che',
4180 'ch': 'cha',
4181 'co': 'cos',
4182 'cr': 'cre',
4183 'cs': 'ces',
4184 'cu': 'chu',
4185 'cv': 'chv',
4186 'cy': 'cym',
4187 'da': 'dan',
4188 'de': 'deu',
4189 'dv': 'div',
4190 'dz': 'dzo',
4191 'ee': 'ewe',
4192 'el': 'ell',
4193 'en': 'eng',
4194 'eo': 'epo',
4195 'es': 'spa',
4196 'et': 'est',
4197 'eu': 'eus',
4198 'fa': 'fas',
4199 'ff': 'ful',
4200 'fi': 'fin',
4201 'fj': 'fij',
4202 'fo': 'fao',
4203 'fr': 'fra',
4204 'fy': 'fry',
4205 'ga': 'gle',
4206 'gd': 'gla',
4207 'gl': 'glg',
4208 'gn': 'grn',
4209 'gu': 'guj',
4210 'gv': 'glv',
4211 'ha': 'hau',
4212 'he': 'heb',
4213 'iw': 'heb', # Replaced by he in 1989 revision
4214 'hi': 'hin',
4215 'ho': 'hmo',
4216 'hr': 'hrv',
4217 'ht': 'hat',
4218 'hu': 'hun',
4219 'hy': 'hye',
4220 'hz': 'her',
4221 'ia': 'ina',
4222 'id': 'ind',
4223 'in': 'ind', # Replaced by id in 1989 revision
4224 'ie': 'ile',
4225 'ig': 'ibo',
4226 'ii': 'iii',
4227 'ik': 'ipk',
4228 'io': 'ido',
4229 'is': 'isl',
4230 'it': 'ita',
4231 'iu': 'iku',
4232 'ja': 'jpn',
4233 'jv': 'jav',
4234 'ka': 'kat',
4235 'kg': 'kon',
4236 'ki': 'kik',
4237 'kj': 'kua',
4238 'kk': 'kaz',
4239 'kl': 'kal',
4240 'km': 'khm',
4241 'kn': 'kan',
4242 'ko': 'kor',
4243 'kr': 'kau',
4244 'ks': 'kas',
4245 'ku': 'kur',
4246 'kv': 'kom',
4247 'kw': 'cor',
4248 'ky': 'kir',
4249 'la': 'lat',
4250 'lb': 'ltz',
4251 'lg': 'lug',
4252 'li': 'lim',
4253 'ln': 'lin',
4254 'lo': 'lao',
4255 'lt': 'lit',
4256 'lu': 'lub',
4257 'lv': 'lav',
4258 'mg': 'mlg',
4259 'mh': 'mah',
4260 'mi': 'mri',
4261 'mk': 'mkd',
4262 'ml': 'mal',
4263 'mn': 'mon',
4264 'mr': 'mar',
4265 'ms': 'msa',
4266 'mt': 'mlt',
4267 'my': 'mya',
4268 'na': 'nau',
4269 'nb': 'nob',
4270 'nd': 'nde',
4271 'ne': 'nep',
4272 'ng': 'ndo',
4273 'nl': 'nld',
4274 'nn': 'nno',
4275 'no': 'nor',
4276 'nr': 'nbl',
4277 'nv': 'nav',
4278 'ny': 'nya',
4279 'oc': 'oci',
4280 'oj': 'oji',
4281 'om': 'orm',
4282 'or': 'ori',
4283 'os': 'oss',
4284 'pa': 'pan',
4285 'pi': 'pli',
4286 'pl': 'pol',
4287 'ps': 'pus',
4288 'pt': 'por',
4289 'qu': 'que',
4290 'rm': 'roh',
4291 'rn': 'run',
4292 'ro': 'ron',
4293 'ru': 'rus',
4294 'rw': 'kin',
4295 'sa': 'san',
4296 'sc': 'srd',
4297 'sd': 'snd',
4298 'se': 'sme',
4299 'sg': 'sag',
4300 'si': 'sin',
4301 'sk': 'slk',
4302 'sl': 'slv',
4303 'sm': 'smo',
4304 'sn': 'sna',
4305 'so': 'som',
4306 'sq': 'sqi',
4307 'sr': 'srp',
4308 'ss': 'ssw',
4309 'st': 'sot',
4310 'su': 'sun',
4311 'sv': 'swe',
4312 'sw': 'swa',
4313 'ta': 'tam',
4314 'te': 'tel',
4315 'tg': 'tgk',
4316 'th': 'tha',
4317 'ti': 'tir',
4318 'tk': 'tuk',
4319 'tl': 'tgl',
4320 'tn': 'tsn',
4321 'to': 'ton',
4322 'tr': 'tur',
4323 'ts': 'tso',
4324 'tt': 'tat',
4325 'tw': 'twi',
4326 'ty': 'tah',
4327 'ug': 'uig',
4328 'uk': 'ukr',
4329 'ur': 'urd',
4330 'uz': 'uzb',
4331 've': 'ven',
4332 'vi': 'vie',
4333 'vo': 'vol',
4334 'wa': 'wln',
4335 'wo': 'wol',
4336 'xh': 'xho',
4337 'yi': 'yid',
4338 'ji': 'yid', # Replaced by yi in 1989 revision
4339 'yo': 'yor',
4340 'za': 'zha',
4341 'zh': 'zho',
4342 'zu': 'zul',
4343 }
4344
4345 @classmethod
4346 def short2long(cls, code):
4347 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4348 return cls._lang_map.get(code[:2])
4349
4350 @classmethod
4351 def long2short(cls, code):
4352 """Convert language code from ISO 639-2/T to ISO 639-1"""
4353 for short_name, long_name in cls._lang_map.items():
4354 if long_name == code:
4355 return short_name
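
# Examples (illustrative): ISO639Utils.short2long('en') == 'eng'
# and ISO639Utils.long2short('deu') == 'de'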
4356
4357
4358 class ISO3166Utils:
4359 # From http://data.okfn.org/data/core/country-list
4360 _country_map = {
4361 'AF': 'Afghanistan',
4362 'AX': 'Åland Islands',
4363 'AL': 'Albania',
4364 'DZ': 'Algeria',
4365 'AS': 'American Samoa',
4366 'AD': 'Andorra',
4367 'AO': 'Angola',
4368 'AI': 'Anguilla',
4369 'AQ': 'Antarctica',
4370 'AG': 'Antigua and Barbuda',
4371 'AR': 'Argentina',
4372 'AM': 'Armenia',
4373 'AW': 'Aruba',
4374 'AU': 'Australia',
4375 'AT': 'Austria',
4376 'AZ': 'Azerbaijan',
4377 'BS': 'Bahamas',
4378 'BH': 'Bahrain',
4379 'BD': 'Bangladesh',
4380 'BB': 'Barbados',
4381 'BY': 'Belarus',
4382 'BE': 'Belgium',
4383 'BZ': 'Belize',
4384 'BJ': 'Benin',
4385 'BM': 'Bermuda',
4386 'BT': 'Bhutan',
4387 'BO': 'Bolivia, Plurinational State of',
4388 'BQ': 'Bonaire, Sint Eustatius and Saba',
4389 'BA': 'Bosnia and Herzegovina',
4390 'BW': 'Botswana',
4391 'BV': 'Bouvet Island',
4392 'BR': 'Brazil',
4393 'IO': 'British Indian Ocean Territory',
4394 'BN': 'Brunei Darussalam',
4395 'BG': 'Bulgaria',
4396 'BF': 'Burkina Faso',
4397 'BI': 'Burundi',
4398 'KH': 'Cambodia',
4399 'CM': 'Cameroon',
4400 'CA': 'Canada',
4401 'CV': 'Cape Verde',
4402 'KY': 'Cayman Islands',
4403 'CF': 'Central African Republic',
4404 'TD': 'Chad',
4405 'CL': 'Chile',
4406 'CN': 'China',
4407 'CX': 'Christmas Island',
4408 'CC': 'Cocos (Keeling) Islands',
4409 'CO': 'Colombia',
4410 'KM': 'Comoros',
4411 'CG': 'Congo',
4412 'CD': 'Congo, the Democratic Republic of the',
4413 'CK': 'Cook Islands',
4414 'CR': 'Costa Rica',
4415 'CI': 'Côte d\'Ivoire',
4416 'HR': 'Croatia',
4417 'CU': 'Cuba',
4418 'CW': 'Curaçao',
4419 'CY': 'Cyprus',
4420 'CZ': 'Czech Republic',
4421 'DK': 'Denmark',
4422 'DJ': 'Djibouti',
4423 'DM': 'Dominica',
4424 'DO': 'Dominican Republic',
4425 'EC': 'Ecuador',
4426 'EG': 'Egypt',
4427 'SV': 'El Salvador',
4428 'GQ': 'Equatorial Guinea',
4429 'ER': 'Eritrea',
4430 'EE': 'Estonia',
4431 'ET': 'Ethiopia',
4432 'FK': 'Falkland Islands (Malvinas)',
4433 'FO': 'Faroe Islands',
4434 'FJ': 'Fiji',
4435 'FI': 'Finland',
4436 'FR': 'France',
4437 'GF': 'French Guiana',
4438 'PF': 'French Polynesia',
4439 'TF': 'French Southern Territories',
4440 'GA': 'Gabon',
4441 'GM': 'Gambia',
4442 'GE': 'Georgia',
4443 'DE': 'Germany',
4444 'GH': 'Ghana',
4445 'GI': 'Gibraltar',
4446 'GR': 'Greece',
4447 'GL': 'Greenland',
4448 'GD': 'Grenada',
4449 'GP': 'Guadeloupe',
4450 'GU': 'Guam',
4451 'GT': 'Guatemala',
4452 'GG': 'Guernsey',
4453 'GN': 'Guinea',
4454 'GW': 'Guinea-Bissau',
4455 'GY': 'Guyana',
4456 'HT': 'Haiti',
4457 'HM': 'Heard Island and McDonald Islands',
4458 'VA': 'Holy See (Vatican City State)',
4459 'HN': 'Honduras',
4460 'HK': 'Hong Kong',
4461 'HU': 'Hungary',
4462 'IS': 'Iceland',
4463 'IN': 'India',
4464 'ID': 'Indonesia',
4465 'IR': 'Iran, Islamic Republic of',
4466 'IQ': 'Iraq',
4467 'IE': 'Ireland',
4468 'IM': 'Isle of Man',
4469 'IL': 'Israel',
4470 'IT': 'Italy',
4471 'JM': 'Jamaica',
4472 'JP': 'Japan',
4473 'JE': 'Jersey',
4474 'JO': 'Jordan',
4475 'KZ': 'Kazakhstan',
4476 'KE': 'Kenya',
4477 'KI': 'Kiribati',
4478 'KP': 'Korea, Democratic People\'s Republic of',
4479 'KR': 'Korea, Republic of',
4480 'KW': 'Kuwait',
4481 'KG': 'Kyrgyzstan',
4482 'LA': 'Lao People\'s Democratic Republic',
4483 'LV': 'Latvia',
4484 'LB': 'Lebanon',
4485 'LS': 'Lesotho',
4486 'LR': 'Liberia',
4487 'LY': 'Libya',
4488 'LI': 'Liechtenstein',
4489 'LT': 'Lithuania',
4490 'LU': 'Luxembourg',
4491 'MO': 'Macao',
4492 'MK': 'Macedonia, the Former Yugoslav Republic of',
4493 'MG': 'Madagascar',
4494 'MW': 'Malawi',
4495 'MY': 'Malaysia',
4496 'MV': 'Maldives',
4497 'ML': 'Mali',
4498 'MT': 'Malta',
4499 'MH': 'Marshall Islands',
4500 'MQ': 'Martinique',
4501 'MR': 'Mauritania',
4502 'MU': 'Mauritius',
4503 'YT': 'Mayotte',
4504 'MX': 'Mexico',
4505 'FM': 'Micronesia, Federated States of',
4506 'MD': 'Moldova, Republic of',
4507 'MC': 'Monaco',
4508 'MN': 'Mongolia',
4509 'ME': 'Montenegro',
4510 'MS': 'Montserrat',
4511 'MA': 'Morocco',
4512 'MZ': 'Mozambique',
4513 'MM': 'Myanmar',
4514 'NA': 'Namibia',
4515 'NR': 'Nauru',
4516 'NP': 'Nepal',
4517 'NL': 'Netherlands',
4518 'NC': 'New Caledonia',
4519 'NZ': 'New Zealand',
4520 'NI': 'Nicaragua',
4521 'NE': 'Niger',
4522 'NG': 'Nigeria',
4523 'NU': 'Niue',
4524 'NF': 'Norfolk Island',
4525 'MP': 'Northern Mariana Islands',
4526 'NO': 'Norway',
4527 'OM': 'Oman',
4528 'PK': 'Pakistan',
4529 'PW': 'Palau',
4530 'PS': 'Palestine, State of',
4531 'PA': 'Panama',
4532 'PG': 'Papua New Guinea',
4533 'PY': 'Paraguay',
4534 'PE': 'Peru',
4535 'PH': 'Philippines',
4536 'PN': 'Pitcairn',
4537 'PL': 'Poland',
4538 'PT': 'Portugal',
4539 'PR': 'Puerto Rico',
4540 'QA': 'Qatar',
4541 'RE': 'Réunion',
4542 'RO': 'Romania',
4543 'RU': 'Russian Federation',
4544 'RW': 'Rwanda',
4545 'BL': 'Saint Barthélemy',
4546 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4547 'KN': 'Saint Kitts and Nevis',
4548 'LC': 'Saint Lucia',
4549 'MF': 'Saint Martin (French part)',
4550 'PM': 'Saint Pierre and Miquelon',
4551 'VC': 'Saint Vincent and the Grenadines',
4552 'WS': 'Samoa',
4553 'SM': 'San Marino',
4554 'ST': 'Sao Tome and Principe',
4555 'SA': 'Saudi Arabia',
4556 'SN': 'Senegal',
4557 'RS': 'Serbia',
4558 'SC': 'Seychelles',
4559 'SL': 'Sierra Leone',
4560 'SG': 'Singapore',
4561 'SX': 'Sint Maarten (Dutch part)',
4562 'SK': 'Slovakia',
4563 'SI': 'Slovenia',
4564 'SB': 'Solomon Islands',
4565 'SO': 'Somalia',
4566 'ZA': 'South Africa',
4567 'GS': 'South Georgia and the South Sandwich Islands',
4568 'SS': 'South Sudan',
4569 'ES': 'Spain',
4570 'LK': 'Sri Lanka',
4571 'SD': 'Sudan',
4572 'SR': 'Suriname',
4573 'SJ': 'Svalbard and Jan Mayen',
4574 'SZ': 'Swaziland',
4575 'SE': 'Sweden',
4576 'CH': 'Switzerland',
4577 'SY': 'Syrian Arab Republic',
4578 'TW': 'Taiwan, Province of China',
4579 'TJ': 'Tajikistan',
4580 'TZ': 'Tanzania, United Republic of',
4581 'TH': 'Thailand',
4582 'TL': 'Timor-Leste',
4583 'TG': 'Togo',
4584 'TK': 'Tokelau',
4585 'TO': 'Tonga',
4586 'TT': 'Trinidad and Tobago',
4587 'TN': 'Tunisia',
4588 'TR': 'Turkey',
4589 'TM': 'Turkmenistan',
4590 'TC': 'Turks and Caicos Islands',
4591 'TV': 'Tuvalu',
4592 'UG': 'Uganda',
4593 'UA': 'Ukraine',
4594 'AE': 'United Arab Emirates',
4595 'GB': 'United Kingdom',
4596 'US': 'United States',
4597 'UM': 'United States Minor Outlying Islands',
4598 'UY': 'Uruguay',
4599 'UZ': 'Uzbekistan',
4600 'VU': 'Vanuatu',
4601 'VE': 'Venezuela, Bolivarian Republic of',
4602 'VN': 'Viet Nam',
4603 'VG': 'Virgin Islands, British',
4604 'VI': 'Virgin Islands, U.S.',
4605 'WF': 'Wallis and Futuna',
4606 'EH': 'Western Sahara',
4607 'YE': 'Yemen',
4608 'ZM': 'Zambia',
4609 'ZW': 'Zimbabwe',
4610 # Not ISO 3166 codes, but used for IP blocks
4611 'AP': 'Asia/Pacific Region',
4612 'EU': 'Europe',
4613 }
4614
4615 @classmethod
4616 def short2full(cls, code):
4617 """Convert an ISO 3166-2 country code to the corresponding full name"""
4618 return cls._country_map.get(code.upper())
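
# Example (illustrative): ISO3166Utils.short2full('DE') == 'Germany'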
4619
4620
4621 class GeoUtils:
4622 # Major IPv4 address blocks per country
4623 _country_ip_map = {
4624 'AD': '46.172.224.0/19',
4625 'AE': '94.200.0.0/13',
4626 'AF': '149.54.0.0/17',
4627 'AG': '209.59.64.0/18',
4628 'AI': '204.14.248.0/21',
4629 'AL': '46.99.0.0/16',
4630 'AM': '46.70.0.0/15',
4631 'AO': '105.168.0.0/13',
4632 'AP': '182.50.184.0/21',
4633 'AQ': '23.154.160.0/24',
4634 'AR': '181.0.0.0/12',
4635 'AS': '202.70.112.0/20',
4636 'AT': '77.116.0.0/14',
4637 'AU': '1.128.0.0/11',
4638 'AW': '181.41.0.0/18',
4639 'AX': '185.217.4.0/22',
4640 'AZ': '5.197.0.0/16',
4641 'BA': '31.176.128.0/17',
4642 'BB': '65.48.128.0/17',
4643 'BD': '114.130.0.0/16',
4644 'BE': '57.0.0.0/8',
4645 'BF': '102.178.0.0/15',
4646 'BG': '95.42.0.0/15',
4647 'BH': '37.131.0.0/17',
4648 'BI': '154.117.192.0/18',
4649 'BJ': '137.255.0.0/16',
4650 'BL': '185.212.72.0/23',
4651 'BM': '196.12.64.0/18',
4652 'BN': '156.31.0.0/16',
4653 'BO': '161.56.0.0/16',
4654 'BQ': '161.0.80.0/20',
4655 'BR': '191.128.0.0/12',
4656 'BS': '24.51.64.0/18',
4657 'BT': '119.2.96.0/19',
4658 'BW': '168.167.0.0/16',
4659 'BY': '178.120.0.0/13',
4660 'BZ': '179.42.192.0/18',
4661 'CA': '99.224.0.0/11',
4662 'CD': '41.243.0.0/16',
4663 'CF': '197.242.176.0/21',
4664 'CG': '160.113.0.0/16',
4665 'CH': '85.0.0.0/13',
4666 'CI': '102.136.0.0/14',
4667 'CK': '202.65.32.0/19',
4668 'CL': '152.172.0.0/14',
4669 'CM': '102.244.0.0/14',
4670 'CN': '36.128.0.0/10',
4671 'CO': '181.240.0.0/12',
4672 'CR': '201.192.0.0/12',
4673 'CU': '152.206.0.0/15',
4674 'CV': '165.90.96.0/19',
4675 'CW': '190.88.128.0/17',
4676 'CY': '31.153.0.0/16',
4677 'CZ': '88.100.0.0/14',
4678 'DE': '53.0.0.0/8',
4679 'DJ': '197.241.0.0/17',
4680 'DK': '87.48.0.0/12',
4681 'DM': '192.243.48.0/20',
4682 'DO': '152.166.0.0/15',
4683 'DZ': '41.96.0.0/12',
4684 'EC': '186.68.0.0/15',
4685 'EE': '90.190.0.0/15',
4686 'EG': '156.160.0.0/11',
4687 'ER': '196.200.96.0/20',
4688 'ES': '88.0.0.0/11',
4689 'ET': '196.188.0.0/14',
4690 'EU': '2.16.0.0/13',
4691 'FI': '91.152.0.0/13',
4692 'FJ': '144.120.0.0/16',
4693 'FK': '80.73.208.0/21',
4694 'FM': '119.252.112.0/20',
4695 'FO': '88.85.32.0/19',
4696 'FR': '90.0.0.0/9',
4697 'GA': '41.158.0.0/15',
4698 'GB': '25.0.0.0/8',
4699 'GD': '74.122.88.0/21',
4700 'GE': '31.146.0.0/16',
4701 'GF': '161.22.64.0/18',
4702 'GG': '62.68.160.0/19',
4703 'GH': '154.160.0.0/12',
4704 'GI': '95.164.0.0/16',
4705 'GL': '88.83.0.0/19',
4706 'GM': '160.182.0.0/15',
4707 'GN': '197.149.192.0/18',
4708 'GP': '104.250.0.0/19',
4709 'GQ': '105.235.224.0/20',
4710 'GR': '94.64.0.0/13',
4711 'GT': '168.234.0.0/16',
4712 'GU': '168.123.0.0/16',
4713 'GW': '197.214.80.0/20',
4714 'GY': '181.41.64.0/18',
4715 'HK': '113.252.0.0/14',
4716 'HN': '181.210.0.0/16',
4717 'HR': '93.136.0.0/13',
4718 'HT': '148.102.128.0/17',
4719 'HU': '84.0.0.0/14',
4720 'ID': '39.192.0.0/10',
4721 'IE': '87.32.0.0/12',
4722 'IL': '79.176.0.0/13',
4723 'IM': '5.62.80.0/20',
4724 'IN': '117.192.0.0/10',
4725 'IO': '203.83.48.0/21',
4726 'IQ': '37.236.0.0/14',
4727 'IR': '2.176.0.0/12',
4728 'IS': '82.221.0.0/16',
4729 'IT': '79.0.0.0/10',
4730 'JE': '87.244.64.0/18',
4731 'JM': '72.27.0.0/17',
4732 'JO': '176.29.0.0/16',
4733 'JP': '133.0.0.0/8',
4734 'KE': '105.48.0.0/12',
4735 'KG': '158.181.128.0/17',
4736 'KH': '36.37.128.0/17',
4737 'KI': '103.25.140.0/22',
4738 'KM': '197.255.224.0/20',
4739 'KN': '198.167.192.0/19',
4740 'KP': '175.45.176.0/22',
4741 'KR': '175.192.0.0/10',
4742 'KW': '37.36.0.0/14',
4743 'KY': '64.96.0.0/15',
4744 'KZ': '2.72.0.0/13',
4745 'LA': '115.84.64.0/18',
4746 'LB': '178.135.0.0/16',
4747 'LC': '24.92.144.0/20',
4748 'LI': '82.117.0.0/19',
4749 'LK': '112.134.0.0/15',
4750 'LR': '102.183.0.0/16',
4751 'LS': '129.232.0.0/17',
4752 'LT': '78.56.0.0/13',
4753 'LU': '188.42.0.0/16',
4754 'LV': '46.109.0.0/16',
4755 'LY': '41.252.0.0/14',
4756 'MA': '105.128.0.0/11',
4757 'MC': '88.209.64.0/18',
4758 'MD': '37.246.0.0/16',
4759 'ME': '178.175.0.0/17',
4760 'MF': '74.112.232.0/21',
4761 'MG': '154.126.0.0/17',
4762 'MH': '117.103.88.0/21',
4763 'MK': '77.28.0.0/15',
4764 'ML': '154.118.128.0/18',
4765 'MM': '37.111.0.0/17',
4766 'MN': '49.0.128.0/17',
4767 'MO': '60.246.0.0/16',
4768 'MP': '202.88.64.0/20',
4769 'MQ': '109.203.224.0/19',
4770 'MR': '41.188.64.0/18',
4771 'MS': '208.90.112.0/22',
4772 'MT': '46.11.0.0/16',
4773 'MU': '105.16.0.0/12',
4774 'MV': '27.114.128.0/18',
4775 'MW': '102.70.0.0/15',
4776 'MX': '187.192.0.0/11',
4777 'MY': '175.136.0.0/13',
4778 'MZ': '197.218.0.0/15',
4779 'NA': '41.182.0.0/16',
4780 'NC': '101.101.0.0/18',
4781 'NE': '197.214.0.0/18',
4782 'NF': '203.17.240.0/22',
4783 'NG': '105.112.0.0/12',
4784 'NI': '186.76.0.0/15',
4785 'NL': '145.96.0.0/11',
4786 'NO': '84.208.0.0/13',
4787 'NP': '36.252.0.0/15',
4788 'NR': '203.98.224.0/19',
4789 'NU': '49.156.48.0/22',
4790 'NZ': '49.224.0.0/14',
4791 'OM': '5.36.0.0/15',
4792 'PA': '186.72.0.0/15',
4793 'PE': '186.160.0.0/14',
4794 'PF': '123.50.64.0/18',
4795 'PG': '124.240.192.0/19',
4796 'PH': '49.144.0.0/13',
4797 'PK': '39.32.0.0/11',
4798 'PL': '83.0.0.0/11',
4799 'PM': '70.36.0.0/20',
4800 'PR': '66.50.0.0/16',
4801 'PS': '188.161.0.0/16',
4802 'PT': '85.240.0.0/13',
4803 'PW': '202.124.224.0/20',
4804 'PY': '181.120.0.0/14',
4805 'QA': '37.210.0.0/15',
4806 'RE': '102.35.0.0/16',
4807 'RO': '79.112.0.0/13',
4808 'RS': '93.86.0.0/15',
4809 'RU': '5.136.0.0/13',
4810 'RW': '41.186.0.0/16',
4811 'SA': '188.48.0.0/13',
4812 'SB': '202.1.160.0/19',
4813 'SC': '154.192.0.0/11',
4814 'SD': '102.120.0.0/13',
4815 'SE': '78.64.0.0/12',
4816 'SG': '8.128.0.0/10',
4817 'SI': '188.196.0.0/14',
4818 'SK': '78.98.0.0/15',
4819 'SL': '102.143.0.0/17',
4820 'SM': '89.186.32.0/19',
4821 'SN': '41.82.0.0/15',
4822 'SO': '154.115.192.0/18',
4823 'SR': '186.179.128.0/17',
4824 'SS': '105.235.208.0/21',
4825 'ST': '197.159.160.0/19',
4826 'SV': '168.243.0.0/16',
4827 'SX': '190.102.0.0/20',
4828 'SY': '5.0.0.0/16',
4829 'SZ': '41.84.224.0/19',
4830 'TC': '65.255.48.0/20',
4831 'TD': '154.68.128.0/19',
4832 'TG': '196.168.0.0/14',
4833 'TH': '171.96.0.0/13',
4834 'TJ': '85.9.128.0/18',
4835 'TK': '27.96.24.0/21',
4836 'TL': '180.189.160.0/20',
4837 'TM': '95.85.96.0/19',
4838 'TN': '197.0.0.0/11',
4839 'TO': '175.176.144.0/21',
4840 'TR': '78.160.0.0/11',
4841 'TT': '186.44.0.0/15',
4842 'TV': '202.2.96.0/19',
4843 'TW': '120.96.0.0/11',
4844 'TZ': '156.156.0.0/14',
4845 'UA': '37.52.0.0/14',
4846 'UG': '102.80.0.0/13',
4847 'US': '6.0.0.0/8',
4848 'UY': '167.56.0.0/13',
4849 'UZ': '84.54.64.0/18',
4850 'VA': '212.77.0.0/19',
4851 'VC': '207.191.240.0/21',
4852 'VE': '186.88.0.0/13',
4853 'VG': '66.81.192.0/20',
4854 'VI': '146.226.0.0/16',
4855 'VN': '14.160.0.0/11',
4856 'VU': '202.80.32.0/20',
4857 'WF': '117.20.32.0/21',
4858 'WS': '202.4.32.0/19',
4859 'YE': '134.35.0.0/16',
4860 'YT': '41.242.116.0/22',
4861 'ZA': '41.0.0.0/11',
4862 'ZM': '102.144.0.0/13',
4863 'ZW': '102.177.192.0/18',
4864 }
4865
4866 @classmethod
4867 def random_ipv4(cls, code_or_block):
4868 if len(code_or_block) == 2:
4869 block = cls._country_ip_map.get(code_or_block.upper())
4870 if not block:
4871 return None
4872 else:
4873 block = code_or_block
4874 addr, preflen = block.split('/')
4875 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4876 addr_max = addr_min | (0xffffffff >> int(preflen))
4877 return str(socket.inet_ntoa(
4878 struct.pack('!L', random.randint(addr_min, addr_max))))
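
# Example (illustrative): GeoUtils.random_ipv4('DE') picks a random address inside
# 53.0.0.0/8, while GeoUtils.random_ipv4('10.0.0.0/8') samples the given block directly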
4879
4880
4881 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4882 def __init__(self, proxies=None):
4883 # Set default handlers
4884 for type in ('http', 'https'):
4885 setattr(self, '%s_open' % type,
4886 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4887 meth(r, proxy, type))
4888 urllib.request.ProxyHandler.__init__(self, proxies)
4889
4890 def proxy_open(self, req, proxy, type):
4891 req_proxy = req.headers.get('Ytdl-request-proxy')
4892 if req_proxy is not None:
4893 proxy = req_proxy
4894 del req.headers['Ytdl-request-proxy']
4895
4896 if proxy == '__noproxy__':
4897 return None # No Proxy
4898 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4899 req.add_header('Ytdl-socks-proxy', proxy)
4900 # yt-dlp's http/https handlers do the wrapping of the socket with socks
4901 return None
4902 return urllib.request.ProxyHandler.proxy_open(
4903 self, req, proxy, type)
4904
4905
4906 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4907 # released into the public domain
4908 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4909
4910 def long_to_bytes(n, blocksize=0):
4911 """long_to_bytes(n:long, blocksize:int) : string
4912 Convert a long integer to a byte string.
4913
4914 If optional blocksize is given and greater than zero, pad the front of the
4915 byte string with binary zeros so that the length is a multiple of
4916 blocksize.
4917 """
4918 # after much testing, this algorithm was deemed to be the fastest
4919 s = b''
4920 n = int(n)
4921 while n > 0:
4922 s = struct.pack('>I', n & 0xffffffff) + s
4923 n = n >> 32
4924 # strip off leading zeros
4925 for i in range(len(s)):
4926 if s[i] != b'\000'[0]:
4927 break
4928 else:
4929 # only happens when n == 0
4930 s = b'\000'
4931 i = 0
4932 s = s[i:]
4933 # add back some pad bytes. this could be done more efficiently w.r.t. the
4934 # de-padding being done above, but sigh...
4935 if blocksize > 0 and len(s) % blocksize:
4936 s = (blocksize - len(s) % blocksize) * b'\000' + s
4937 return s
4938
4939
4940 def bytes_to_long(s):
4941 """bytes_to_long(string) : long
4942 Convert a byte string to a long integer.
4943
4944 This is (essentially) the inverse of long_to_bytes().
4945 """
4946 acc = 0
4947 length = len(s)
4948 if length % 4:
4949 extra = (4 - length % 4)
4950 s = b'\000' * extra + s
4951 length = length + extra
4952 for i in range(0, length, 4):
4953 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4954 return acc
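
# Example (illustrative): the two functions are inverses, e.g.
# long_to_bytes(65537, blocksize=4) == b'\x00\x01\x00\x01'
# and bytes_to_long(b'\x00\x01\x00\x01') == 65537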
4955
4956
4957 def ohdave_rsa_encrypt(data, exponent, modulus):
4958 '''
4959 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4960
4961 Input:
4962 data: data to encrypt, bytes-like object
4963 exponent, modulus: parameters e and N of the RSA algorithm, both integers
4964 Output: hex string of encrypted data
4965
4966 Limitation: supports one block encryption only
4967 '''
4968
4969 payload = int(binascii.hexlify(data[::-1]), 16)
4970 encrypted = pow(payload, exponent, modulus)
4971 return '%x' % encrypted
4972
4973
4974 def pkcs1pad(data, length):
4975 """
4976 Padding input data with PKCS#1 scheme
4977
4978 @param {int[]} data input data
4979 @param {int} length target length
4980 @returns {int[]} padded data
4981 """
4982 if len(data) > length - 11:
4983 raise ValueError('Input data too long for PKCS#1 padding')
4984
4985 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 v1.5 padding octets must be non-zero
4986 return [0, 2] + pseudo_random + [0] + data
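
# Example (illustrative): len(pkcs1pad([1, 2, 3], 32)) == 32, with the layout
# [0, 2, *random padding, 0, 1, 2, 3]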
4987
4988
4989 def _base_n_table(n, table):
4990 if not table and not n:
4991 raise ValueError('Either table or n must be specified')
4992 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4993
4994 if n and n != len(table):
4995 raise ValueError(f'base {n} exceeds table length {len(table)}')
4996 return table
4997
4998
4999 def encode_base_n(num, n=None, table=None):
5000 """Convert given int to a base-n string"""
5001 table = _base_n_table(n, table)
5002 if not num:
5003 return table[0]
5004
5005 result, base = '', len(table)
5006 while num:
5007 result = table[num % base] + result
5008 num = num // base
5009 return result
5010
5011
5012 def decode_base_n(string, n=None, table=None):
5013 """Convert given base-n string to int"""
5014 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
5015 result, base = 0, len(table)
5016 for char in string:
5017 result = result * base + table[char]
5018 return result
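
# Example (illustrative): encode_base_n(255, 16) == 'ff' and decode_base_n('ff', 16) == 255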
5019
5020
5021 def decode_packed_codes(code):
5022 mobj = re.search(PACKED_CODES_RE, code)
5023 obfuscated_code, base, count, symbols = mobj.groups()
5024 base = int(base)
5025 count = int(count)
5026 symbols = symbols.split('|')
5027 symbol_table = {}
5028
5029 while count:
5030 count -= 1
5031 base_n_count = encode_base_n(count, base)
5032 symbol_table[base_n_count] = symbols[count] or base_n_count
5033
5034 return re.sub(
5035 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
5036 obfuscated_code)
5037
5038
5039 def caesar(s, alphabet, shift):
5040 if shift == 0:
5041 return s
5042 l = len(alphabet)
5043 return ''.join(
5044 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5045 for c in s)
5046
5047
5048 def rot47(s):
5049 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
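
# Example (illustrative): rot47 is its own inverse over the 94 printable ASCII
# characters, so rot47(rot47(s)) == s; e.g. rot47('Hello') == 'w6==@'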
5050
5051
5052 def parse_m3u8_attributes(attrib):
5053 info = {}
5054 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5055 if val.startswith('"'):
5056 val = val[1:-1]
5057 info[key] = val
5058 return info
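
# Example (illustrative):
# parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1,mp4a"')
# == {'BANDWIDTH': '1280000', 'CODECS': 'avc1,mp4a'}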
5059
5060
5061 def urshift(val, n):
5062 return val >> n if val >= 0 else (val + 0x100000000) >> n
5063
5064
5065 def write_xattr(path, key, value):
5066 # Windows: Write xattrs to NTFS Alternate Data Streams:
5067 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5068 if compat_os_name == 'nt':
5069 assert ':' not in key
5070 assert os.path.exists(path)
5071
5072 try:
5073 with open(f'{path}:{key}', 'wb') as f:
5074 f.write(value)
5075 except OSError as e:
5076 raise XAttrMetadataError(e.errno, e.strerror)
5077 return
5078
5079 # UNIX Method 1. Use xattrs/pyxattrs modules
5080
5081 setxattr = None
5082 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5083 # Unicode arguments are not supported in pyxattr until version 0.5.0
5084 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5085 if version_tuple(xattr.__version__) >= (0, 5, 0):
5086 setxattr = xattr.set
5087 elif xattr:
5088 setxattr = xattr.setxattr
5089
5090 if setxattr:
5091 try:
5092 setxattr(path, key, value)
5093 except OSError as e:
5094 raise XAttrMetadataError(e.errno, e.strerror)
5095 return
5096
5097 # UNIX Method 2. Use setfattr/xattr executables
5098 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5099 else 'xattr' if check_executable('xattr', ['-h']) else None)
5100 if not exe:
5101 raise XAttrUnavailableError(
5102 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5103 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5104
5105 value = value.decode()
5106 try:
5107 _, stderr, returncode = Popen.run(
5108 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5109 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5110 except OSError as e:
5111 raise XAttrMetadataError(e.errno, e.strerror)
5112 if returncode:
5113 raise XAttrMetadataError(returncode, stderr)
5114
5115
5116 def random_birthday(year_field, month_field, day_field):
5117 start_date = datetime.date(1950, 1, 1)
5118 end_date = datetime.date(1995, 12, 31)
5119 offset = random.randint(0, (end_date - start_date).days)
5120 random_date = start_date + datetime.timedelta(offset)
5121 return {
5122 year_field: str(random_date.year),
5123 month_field: str(random_date.month),
5124 day_field: str(random_date.day),
5125 }
5126
5127
5128 def find_available_port(interface=''):
5129 try:
5130 with socket.socket() as sock:
5131 sock.bind((interface, 0))
5132 return sock.getsockname()[1]
5133 except OSError:
5134 return None
5135
5136
5137 # Templates for internet shortcut files, which are plain text files.
5138 DOT_URL_LINK_TEMPLATE = '''\
5139 [InternetShortcut]
5140 URL=%(url)s
5141 '''
5142
5143 DOT_WEBLOC_LINK_TEMPLATE = '''\
5144 <?xml version="1.0" encoding="UTF-8"?>
5145 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5146 <plist version="1.0">
5147 <dict>
5148 \t<key>URL</key>
5149 \t<string>%(url)s</string>
5150 </dict>
5151 </plist>
5152 '''
5153
5154 DOT_DESKTOP_LINK_TEMPLATE = '''\
5155 [Desktop Entry]
5156 Encoding=UTF-8
5157 Name=%(filename)s
5158 Type=Link
5159 URL=%(url)s
5160 Icon=text-html
5161 '''
5162
5163 LINK_TEMPLATES = {
5164 'url': DOT_URL_LINK_TEMPLATE,
5165 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5166 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5167 }
5168
5169
5170 def iri_to_uri(iri):
5171 """
5172 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5173
5174 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5175 """
5176
5177 iri_parts = urllib.parse.urlparse(iri)
5178
5179 if '[' in iri_parts.netloc:
5180 raise ValueError('IPv6 URIs are not yet supported.')
5181 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5182
5183 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5184
5185 net_location = ''
5186 if iri_parts.username:
5187 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5188 if iri_parts.password is not None:
5189 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5190 net_location += '@'
5191
5192 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5193 # The 'idna' encoding produces ASCII text.
5194 if iri_parts.port is not None and iri_parts.port != 80:
5195 net_location += ':' + str(iri_parts.port)
5196
5197 return urllib.parse.urlunparse(
5198 (iri_parts.scheme,
5199 net_location,
5200
5201 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5202
5203 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5204 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5205
5206 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5207 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5208
5209 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5210
5211 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
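
# Example (illustrative): iri_to_uri('http://example.com/ä?q=1') should yield
# 'http://example.com/%C3%A4?q=1' (non-ASCII path characters are UTF-8 percent-encoded)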
5212
5213
5214 def to_high_limit_path(path):
5215 if sys.platform in ['win32', 'cygwin']:
5216 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5217 return '\\\\?\\' + os.path.abspath(path)
5218
5219 return path
5220
5221
5222 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5223 val = traversal.traverse_obj(obj, *variadic(field))
5224 if not val if ignore is NO_DEFAULT else val in variadic(ignore):
5225 return default
5226 return template % func(val)
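
# Examples (illustrative): format_field({'width': 1280}, 'width', '%dpx') == '1280px',
# while format_field({}, 'width', '%dpx', default='unknown') falls back to 'unknown'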
5227
5228
5229 def clean_podcast_url(url):
5230 return re.sub(r'''(?x)
5231 (?:
5232 (?:
5233 chtbl\.com/track|
5234 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5235 play\.podtrac\.com
5236 )/[^/]+|
5237 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5238 flex\.acast\.com|
5239 pd(?:
5240 cn\.co| # https://podcorn.com/analytics-prefix/
5241 st\.fm # https://podsights.com/docs/
5242 )/e
5243 )/''', '', url)
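
# Example (illustrative): tracking prefixes are stripped, e.g.
# clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/traffic.example.com/ep.mp3')
# == 'https://traffic.example.com/ep.mp3'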
5244
5245
5246 _HEX_TABLE = '0123456789abcdef'
5247
5248
5249 def random_uuidv4():
5250 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5251
5252
5253 def make_dir(path, to_screen=None):
5254 try:
5255 dn = os.path.dirname(path)
5256 if dn:
5257 os.makedirs(dn, exist_ok=True)
5258 return True
5259 except OSError as err:
5260 if callable(to_screen):
5261 to_screen(f'unable to create directory {err}')
5262 return False
5263
5264
5265 def get_executable_path():
5266 from ..update import _get_variant_and_executable_path
5267
5268 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5269
5270
5271 def get_user_config_dirs(package_name):
5272 # .config (e.g. ~/.config/package_name)
5273 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
5274 yield os.path.join(xdg_config_home, package_name)
5275
5276 # appdata (%APPDATA%/package_name)
5277 appdata_dir = os.getenv('appdata')
5278 if appdata_dir:
5279 yield os.path.join(appdata_dir, package_name)
5280
5281 # home (~/.package_name)
5282 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
5283
5284
5285 def get_system_config_dirs(package_name):
5286 # /etc/package_name
5287 yield os.path.join('/etc', package_name)
5288
5289
5290 def time_seconds(**kwargs):
5291 """
5292 Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z)
5293 """
5294 return time.time() + datetime.timedelta(**kwargs).total_seconds()
5295
5296
5297 # create a JSON Web Signature (JWS) with the HS256 algorithm
5298 # the result is in JWS Compact Serialization format
5299 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5300 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5301 def jwt_encode_hs256(payload_data, key, headers={}):
5302 header_data = {
5303 'alg': 'HS256',
5304 'typ': 'JWT',
5305 }
5306 if headers:
5307 header_data.update(headers)
5308 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5309 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5310 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5311 signature_b64 = base64.b64encode(h.digest())
5312 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5313 return token
5314
5315
5316 # can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5317 def jwt_decode_hs256(jwt):
5318 header_b64, payload_b64, signature_b64 = jwt.split('.')
5319 # add trailing ='s that may have been stripped, superfluous ='s are ignored
5320 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
5321 return payload_data
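
# Example (illustrative) round trip:
# token = jwt_encode_hs256({'iss': 'example'}, 'secret-key')
# jwt_decode_hs256(token.decode()) == {'iss': 'example'}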
5322
5323
5324 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5325
5326
5327 @functools.cache
5328 def supports_terminal_sequences(stream):
5329 if compat_os_name == 'nt':
5330 if not WINDOWS_VT_MODE:
5331 return False
5332 elif not os.getenv('TERM'):
5333 return False
5334 try:
5335 return stream.isatty()
5336 except BaseException:
5337 return False
5338
5339
5340 def windows_enable_vt_mode():
5341 """Ref: https://bugs.python.org/issue30075 """
5342 if get_windows_version() < (10, 0, 10586):
5343 return
5344
5345 import ctypes
5346 import ctypes.wintypes
5347 import msvcrt
5348
5349 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5350
5351 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5352 handle = os.open('CONOUT$', os.O_RDWR)
5353 try:
5354 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5355 dw_original_mode = ctypes.wintypes.DWORD()
5356 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5357 if not success:
5358 raise Exception('GetConsoleMode failed')
5359
5360 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5361 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5362 if not success:
5363 raise Exception('SetConsoleMode failed')
5364 finally:
5365 os.close(handle)
5366
5367 global WINDOWS_VT_MODE
5368 WINDOWS_VT_MODE = True
5369 supports_terminal_sequences.cache_clear()
5370
5371
5372 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5373
5374
5375 def remove_terminal_sequences(string):
5376 return _terminal_sequences_re.sub('', string)
5377
5378
5379 def number_of_digits(number):
5380 return len('%d' % number)
5381
5382
5383 def join_nonempty(*values, delim='-', from_dict=None):
5384 if from_dict is not None:
5385 values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
5386 return delim.join(map(str, filter(None, values)))
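
# Examples (illustrative): falsy values are dropped, so join_nonempty('a', None, 'b', 0) == 'a-b'
# and join_nonempty(1920, 1080, delim='x') == '1920x1080'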
5387
5388
5389 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5390 """
5391 Find the largest format dimensions in terms of video width and, for each thumbnail:
5392 * Modify the URL: Match the width with the provided regex and replace with the former width
5393 * Update dimensions
5394
5395 This function is useful with video services that scale the provided thumbnails on demand
5396 """
5397 _keys = ('width', 'height')
5398 max_dimensions = max(
5399 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5400 default=(0, 0))
5401 if not max_dimensions[0]:
5402 return thumbnails
5403 return [
5404 merge_dicts(
5405 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5406 dict(zip(_keys, max_dimensions)), thumbnail)
5407 for thumbnail in thumbnails
5408 ]
5409
5410
5411 def parse_http_range(range):
5412 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5413 if not range:
5414 return None, None, None
5415 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5416 if not crg:
5417 return None, None, None
5418 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
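# e.g. parse_http_range('bytes=0-499') == (0, 499, None)        # "Range" header
#      parse_http_range('bytes 0-499/1234') == (0, 499, 1234)   # "Content-Range" header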
5419
5420
5421 def read_stdin(what):
5422 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5423 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5424 return sys.stdin
5425
5426
5427 def determine_file_encoding(data):
5428 """
5429 Detect the text encoding used
5430 @returns (encoding, bytes to skip)
5431 """
5432
    # BOMs are given priority over coding declarations
5434 for bom, enc in BOMS:
5435 if data.startswith(bom):
5436 return enc, len(bom)
5437
5438 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5439 # We ignore the endianness to get a good enough match
5440 data = data.replace(b'\0', b'')
5441 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5442 return mobj.group(1).decode() if mobj else None, 0
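# e.g. determine_file_encoding(b'\xef\xbb\xbf--flag') == ('utf-8', 3)   # UTF-8 BOM
#      determine_file_encoding(b'# coding: cp932\n--flag') == ('cp932', 0)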
5443
5444
5445 class Config:
5446 own_args = None
5447 parsed_args = None
5448 filename = None
5449 __initialized = False
5450
5451 def __init__(self, parser, label=None):
5452 self.parser, self.label = parser, label
5453 self._loaded_paths, self.configs = set(), []
5454
5455 def init(self, args=None, filename=None):
5456 assert not self.__initialized
5457 self.own_args, self.filename = args, filename
5458 return self.load_configs()
5459
5460 def load_configs(self):
5461 directory = ''
5462 if self.filename:
5463 location = os.path.realpath(self.filename)
5464 directory = os.path.dirname(location)
5465 if location in self._loaded_paths:
5466 return False
5467 self._loaded_paths.add(location)
5468
5469 self.__initialized = True
5470 opts, _ = self.parser.parse_known_args(self.own_args)
5471 self.parsed_args = self.own_args
5472 for location in opts.config_locations or []:
5473 if location == '-':
5474 if location in self._loaded_paths:
5475 continue
5476 self._loaded_paths.add(location)
5477 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5478 continue
5479 location = os.path.join(directory, expand_path(location))
5480 if os.path.isdir(location):
5481 location = os.path.join(location, 'yt-dlp.conf')
5482 if not os.path.exists(location):
5483 self.parser.error(f'config location {location} does not exist')
5484 self.append_config(self.read_file(location), location)
5485 return True
5486
5487 def __str__(self):
5488 label = join_nonempty(
5489 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5490 delim=' ')
5491 return join_nonempty(
5492 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5493 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5494 delim='\n')
5495
5496 @staticmethod
5497 def read_file(filename, default=[]):
5498 try:
5499 optionf = open(filename, 'rb')
5500 except OSError:
5501 return default # silently skip if file is not present
5502 try:
5503 enc, skip = determine_file_encoding(optionf.read(512))
5504 optionf.seek(skip, io.SEEK_SET)
5505 except OSError:
5506 enc = None # silently skip read errors
5507 try:
5508 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5509 contents = optionf.read().decode(enc or preferredencoding())
5510 res = shlex.split(contents, comments=True)
5511 except Exception as err:
5512 raise ValueError(f'Unable to parse "{filename}": {err}')
5513 finally:
5514 optionf.close()
5515 return res
5516
5517 @staticmethod
5518 def hide_login_info(opts):
5519 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5520 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5521
5522 def _scrub_eq(o):
5523 m = eqre.match(o)
5524 if m:
5525 return m.group('key') + '=PRIVATE'
5526 else:
5527 return o
5528
5529 opts = list(map(_scrub_eq, opts))
5530 for idx, opt in enumerate(opts):
5531 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5532 opts[idx + 1] = 'PRIVATE'
5533 return opts
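    # e.g. hide_login_info(['-u', 'name', '--password=secret', '-v'])
    #      == ['-u', 'PRIVATE', '--password=PRIVATE', '-v']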
5534
5535 def append_config(self, *args, label=None):
5536 config = type(self)(self.parser, label)
5537 config._loaded_paths = self._loaded_paths
5538 if config.init(*args):
5539 self.configs.append(config)
5540
5541 @property
5542 def all_args(self):
5543 for config in reversed(self.configs):
5544 yield from config.all_args
5545 yield from self.parsed_args or []
5546
5547 def parse_known_args(self, **kwargs):
5548 return self.parser.parse_known_args(self.all_args, **kwargs)
5549
5550 def parse_args(self):
5551 return self.parser.parse_args(self.all_args)
5552
5553
5554 class WebSocketsWrapper:
5555 """Wraps websockets module to use in non-async scopes"""
5556 pool = None
5557
5558 def __init__(self, url, headers=None, connect=True):
5559 self.loop = asyncio.new_event_loop()
5560 # XXX: "loop" is deprecated
5561 self.conn = websockets.connect(
5562 url, extra_headers=headers, ping_interval=None,
5563 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5564 if connect:
5565 self.__enter__()
5566 atexit.register(self.__exit__, None, None, None)
5567
5568 def __enter__(self):
5569 if not self.pool:
5570 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5571 return self
5572
5573 def send(self, *args):
5574 self.run_with_loop(self.pool.send(*args), self.loop)
5575
5576 def recv(self, *args):
5577 return self.run_with_loop(self.pool.recv(*args), self.loop)
5578
5579 def __exit__(self, type, value, traceback):
5580 try:
5581 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5582 finally:
            # cancel leftover tasks before closing; run_until_complete would raise on a closed loop
            self._cancel_all_tasks(self.loop)
            self.loop.close()
5585
5586 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: if any new library that uses asyncio needs to run in a non-async scope, move these functions out of this class
5588 @staticmethod
5589 def run_with_loop(main, loop):
5590 if not asyncio.iscoroutine(main):
5591 raise ValueError(f'a coroutine was expected, got {main!r}')
5592
5593 try:
5594 return loop.run_until_complete(main)
5595 finally:
5596 loop.run_until_complete(loop.shutdown_asyncgens())
5597 if hasattr(loop, 'shutdown_default_executor'):
5598 loop.run_until_complete(loop.shutdown_default_executor())
5599
5600 @staticmethod
5601 def _cancel_all_tasks(loop):
5602 to_cancel = asyncio.all_tasks(loop)
5603
5604 if not to_cancel:
5605 return
5606
5607 for task in to_cancel:
5608 task.cancel()
5609
        # XXX: the `loop` parameter was removed in Python 3.10+
5611 loop.run_until_complete(
5612 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5613
5614 for task in to_cancel:
5615 if task.cancelled():
5616 continue
5617 if task.exception() is not None:
5618 loop.call_exception_handler({
5619 'message': 'unhandled exception during asyncio.run() shutdown',
5620 'exception': task.exception(),
5621 'task': task,
5622 })
5623
5624
5625 def merge_headers(*dicts):
5626 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5627 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
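# e.g. merge_headers({'user-agent': 'UA1', 'Accept': '*/*'}, {'User-Agent': 'UA2'})
#      == {'User-Agent': 'UA2', 'Accept': '*/*'}   # keys are title-cased, later dicts win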
5628
5629
5630 def cached_method(f):
5631 """Cache a method"""
5632 signature = inspect.signature(f)
5633
5634 @functools.wraps(f)
5635 def wrapper(self, *args, **kwargs):
5636 bound_args = signature.bind(self, *args, **kwargs)
5637 bound_args.apply_defaults()
5638 key = tuple(bound_args.arguments.values())[1:]
5639
5640 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
5641 if key not in cache:
5642 cache[key] = f(self, *args, **kwargs)
5643 return cache[key]
5644 return wrapper
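# Rough usage sketch (hypothetical class; the cache is stored per instance and
# keyed on the bound arguments, so unhashable arguments are not supported):
#   class Client:
#       @cached_method
#       def fetch(self, url):
#           ...  # expensive work, executed once per distinct `url`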
5645
5646
5647 class classproperty:
5648 """property access for class methods with optional caching"""
5649 def __new__(cls, func=None, *args, **kwargs):
5650 if not func:
5651 return functools.partial(cls, *args, **kwargs)
5652 return super().__new__(cls)
5653
5654 def __init__(self, func, *, cache=False):
5655 functools.update_wrapper(self, func)
5656 self.func = func
5657 self._cache = {} if cache else None
5658
5659 def __get__(self, _, cls):
5660 if self._cache is None:
5661 return self.func(cls)
5662 elif cls not in self._cache:
5663 self._cache[cls] = self.func(cls)
5664 return self._cache[cls]
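# Rough usage sketch (hypothetical class; compute() is a placeholder):
#   class Foo:
#       @classproperty(cache=True)
#       def expensive(cls):
#           return compute()  # evaluated at most once per class
#   Foo.expensive  # accessed like an attribute, without ()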
5665
5666
5667 class function_with_repr:
5668 def __init__(self, func, repr_=None):
5669 functools.update_wrapper(self, func)
5670 self.func, self.__repr = func, repr_
5671
5672 def __call__(self, *args, **kwargs):
5673 return self.func(*args, **kwargs)
5674
5675 def __repr__(self):
5676 if self.__repr:
5677 return self.__repr
5678 return f'{self.func.__module__}.{self.func.__qualname__}'
5679
5680
5681 class Namespace(types.SimpleNamespace):
5682 """Immutable namespace"""
5683
5684 def __iter__(self):
5685 return iter(self.__dict__.values())
5686
5687 @property
5688 def items_(self):
5689 return self.__dict__.items()
5690
5691
5692 MEDIA_EXTENSIONS = Namespace(
5693 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5694 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5695 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5696 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
5697 thumbnails=('jpg', 'png', 'webp'),
5698 storyboards=('mhtml', ),
5699 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5700 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5701 )
5702 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5703 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5704
5705 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5706
5707
5708 class RetryManager:
5709 """Usage:
5710 for retry in RetryManager(...):
5711 try:
5712 ...
5713 except SomeException as err:
5714 retry.error = err
5715 continue
5716 """
5717 attempt, _error = 0, None
5718
5719 def __init__(self, _retries, _error_callback, **kwargs):
5720 self.retries = _retries or 0
5721 self.error_callback = functools.partial(_error_callback, **kwargs)
5722
5723 def _should_retry(self):
5724 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5725
5726 @property
5727 def error(self):
5728 if self._error is NO_DEFAULT:
5729 return None
5730 return self._error
5731
5732 @error.setter
5733 def error(self, value):
5734 self._error = value
5735
5736 def __iter__(self):
5737 while self._should_retry():
5738 self.error = NO_DEFAULT
5739 self.attempt += 1
5740 yield self
5741 if self.error:
5742 self.error_callback(self.error, self.attempt, self.retries)
5743
5744 @staticmethod
5745 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5746 """Utility function for reporting retries"""
5747 if count > retries:
5748 if error:
5749 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5750 raise e
5751
5752 if not count:
5753 return warn(e)
5754 elif isinstance(e, ExtractorError):
5755 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5756 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5757
5758 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5759 if delay:
5760 info(f'Sleeping {delay:.2f} seconds ...')
5761 time.sleep(delay)
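# Rough sketch of wiring RetryManager to report_retry (fragile_operation and the
# print callbacks are placeholders):
#   for retry in RetryManager(3, RetryManager.report_retry,
#                             sleep_func=1, info=print, warn=print):
#       try:
#           fragile_operation()
#       except OSError as err:
#           retry.error = err
#           continue
#   # 1 + 3 attempts in total; the last error is re-raised since error= is not passed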
5762
5763
5764 def make_archive_id(ie, video_id):
5765 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5766 return f'{ie_key.lower()} {video_id}'
5767
5768
5769 def truncate_string(s, left, right=0):
5770 assert left > 3 and right >= 0
5771 if s is None or len(s) <= left + right:
5772 return s
    return f'{s[:left - 3]}...{s[-right:] if right else ""}'
5774
5775
5776 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5777 assert 'all' in alias_dict, '"all" alias is required'
5778 requested = list(start or [])
5779 for val in options:
5780 discard = val.startswith('-')
5781 if discard:
5782 val = val[1:]
5783
5784 if val in alias_dict:
5785 val = alias_dict[val] if not discard else [
5786 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5787 # NB: Do not allow regex in aliases for performance
5788 requested = orderedSet_from_options(val, alias_dict, start=requested)
5789 continue
5790
5791 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5792 else [val] if val in alias_dict['all'] else None)
5793 if current is None:
5794 raise ValueError(val)
5795
5796 if discard:
5797 for item in current:
5798 while item in requested:
5799 requested.remove(item)
5800 else:
5801 requested.extend(current)
5802
5803 return orderedSet(requested)
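# e.g. with alias_dict = {'all': ['a', 'b', 'c'], 'default': ['a', 'b']}:
#   orderedSet_from_options(['default', '-b', 'c'], alias_dict) == ['a', 'c']
# ('-x' discards x; aliases expand recursively, and 'all' doubles as the allow-list)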
5804
5805
5806 class FormatSorter:
5807 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
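    # Each component of a -S/--format-sort string is matched against this, e.g.:
    #   'res:1080'    -> field='res', separator=':', limit='1080' (prefer the largest value up to the limit)
    #   'filesize~1G' -> separator='~' prefers the value closest to the limit
    #   '+size'       -> reverse=True sorts that field ascending (prefer smaller)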
5808
5809 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
5810 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
5811 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
5812 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
5813 'height', 'width', 'proto', 'vext', 'abr', 'aext',
5814 'fps', 'fs_approx', 'source', 'id')
5815
5816 settings = {
5817 'vcodec': {'type': 'ordered', 'regex': True,
5818 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
5819 'acodec': {'type': 'ordered', 'regex': True,
5820 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
5821 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
5822 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
5823 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
5824 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
5825 'vext': {'type': 'ordered', 'field': 'video_ext',
5826 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
5827 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
5828 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
5829 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
5830 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
5831 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
5832 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
5833 'field': ('vcodec', 'acodec'),
5834 'function': lambda it: int(any(v != 'none' for v in it))},
5835 'ie_pref': {'priority': True, 'type': 'extractor'},
5836 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
5837 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
5838 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
5839 'quality': {'convert': 'float', 'default': -1},
5840 'filesize': {'convert': 'bytes'},
5841 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
5842 'id': {'convert': 'string', 'field': 'format_id'},
5843 'height': {'convert': 'float_none'},
5844 'width': {'convert': 'float_none'},
5845 'fps': {'convert': 'float_none'},
5846 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
5847 'tbr': {'convert': 'float_none'},
5848 'vbr': {'convert': 'float_none'},
5849 'abr': {'convert': 'float_none'},
5850 'asr': {'convert': 'float_none'},
5851 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
5852
5853 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
5854 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
5855 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
5856 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
5857 'res': {'type': 'multiple', 'field': ('height', 'width'),
5858 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
5859
5860 # Actual field names
5861 'format_id': {'type': 'alias', 'field': 'id'},
5862 'preference': {'type': 'alias', 'field': 'ie_pref'},
5863 'language_preference': {'type': 'alias', 'field': 'lang'},
5864 'source_preference': {'type': 'alias', 'field': 'source'},
5865 'protocol': {'type': 'alias', 'field': 'proto'},
5866 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
5867 'audio_channels': {'type': 'alias', 'field': 'channels'},
5868
5869 # Deprecated
5870 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
5871 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
5872 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
5873 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
5874 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
5875 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
5876 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
5877 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
5878 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
5879 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
5880 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
5881 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
5882 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
5883 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
5884 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5885 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
5886 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5887 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
5888 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5889 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
5890 }
5891
5892 def __init__(self, ydl, field_preference):
5893 self.ydl = ydl
5894 self._order = []
5895 self.evaluate_params(self.ydl.params, field_preference)
5896 if ydl.params.get('verbose'):
5897 self.print_verbose_info(self.ydl.write_debug)
5898
5899 def _get_field_setting(self, field, key):
5900 if field not in self.settings:
5901 if key in ('forced', 'priority'):
5902 return False
5903 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
5904 'deprecated and may be removed in a future version')
5905 self.settings[field] = {}
5906 propObj = self.settings[field]
5907 if key not in propObj:
5908 type = propObj.get('type')
5909 if key == 'field':
5910 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
5911 elif key == 'convert':
5912 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
5913 else:
5914 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
5915 propObj[key] = default
5916 return propObj[key]
5917
5918 def _resolve_field_value(self, field, value, convertNone=False):
5919 if value is None:
5920 if not convertNone:
5921 return None
5922 else:
5923 value = value.lower()
5924 conversion = self._get_field_setting(field, 'convert')
5925 if conversion == 'ignore':
5926 return None
5927 if conversion == 'string':
5928 return value
5929 elif conversion == 'float_none':
5930 return float_or_none(value)
5931 elif conversion == 'bytes':
5932 return parse_bytes(value)
5933 elif conversion == 'order':
5934 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
5935 use_regex = self._get_field_setting(field, 'regex')
5936 list_length = len(order_list)
5937 empty_pos = order_list.index('') if '' in order_list else list_length + 1
5938 if use_regex and value is not None:
5939 for i, regex in enumerate(order_list):
5940 if regex and re.match(regex, value):
5941 return list_length - i
5942 return list_length - empty_pos # not in list
5943 else: # not regex or value = None
5944 return list_length - (order_list.index(value) if value in order_list else empty_pos)
5945 else:
5946 if value.isnumeric():
5947 return float(value)
5948 else:
5949 self.settings[field]['convert'] = 'string'
5950 return value
5951
5952 def evaluate_params(self, params, sort_extractor):
5953 self._use_free_order = params.get('prefer_free_formats', False)
5954 self._sort_user = params.get('format_sort', [])
5955 self._sort_extractor = sort_extractor
5956
5957 def add_item(field, reverse, closest, limit_text):
5958 field = field.lower()
5959 if field in self._order:
5960 return
5961 self._order.append(field)
5962 limit = self._resolve_field_value(field, limit_text)
5963 data = {
5964 'reverse': reverse,
5965 'closest': False if limit is None else closest,
5966 'limit_text': limit_text,
5967 'limit': limit}
5968 if field in self.settings:
5969 self.settings[field].update(data)
5970 else:
5971 self.settings[field] = data
5972
5973 sort_list = (
5974 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
5975 + (tuple() if params.get('format_sort_force', False)
5976 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
5977 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
5978
5979 for item in sort_list:
5980 match = re.match(self.regex, item)
5981 if match is None:
5982 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
5983 field = match.group('field')
5984 if field is None:
5985 continue
5986 if self._get_field_setting(field, 'type') == 'alias':
5987 alias, field = field, self._get_field_setting(field, 'field')
5988 if self._get_field_setting(alias, 'deprecated'):
5989 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
5990 f'be removed in a future version. Please use {field} instead')
5991 reverse = match.group('reverse') is not None
5992 closest = match.group('separator') == '~'
5993 limit_text = match.group('limit')
5994
5995 has_limit = limit_text is not None
5996 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
5997 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
5998
5999 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6000 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6001 limit_count = len(limits)
6002 for (i, f) in enumerate(fields):
6003 add_item(f, reverse, closest,
6004 limits[i] if i < limit_count
6005 else limits[0] if has_limit and not has_multiple_limits
6006 else None)
6007
6008 def print_verbose_info(self, write_debug):
6009 if self._sort_user:
6010 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6011 if self._sort_extractor:
6012 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6013 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6014 '+' if self._get_field_setting(field, 'reverse') else '', field,
6015 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6016 self._get_field_setting(field, 'limit_text'),
6017 self._get_field_setting(field, 'limit'))
6018 if self._get_field_setting(field, 'limit_text') is not None else '')
6019 for field in self._order if self._get_field_setting(field, 'visible')]))
6020
6021 def _calculate_field_preference_from_value(self, format, field, type, value):
6022 reverse = self._get_field_setting(field, 'reverse')
6023 closest = self._get_field_setting(field, 'closest')
6024 limit = self._get_field_setting(field, 'limit')
6025
6026 if type == 'extractor':
6027 maximum = self._get_field_setting(field, 'max')
6028 if value is None or (maximum is not None and value >= maximum):
6029 value = -1
6030 elif type == 'boolean':
6031 in_list = self._get_field_setting(field, 'in_list')
6032 not_in_list = self._get_field_setting(field, 'not_in_list')
6033 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6034 elif type == 'ordered':
6035 value = self._resolve_field_value(field, value, True)
6036
6037 # try to convert to number
6038 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6039 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6040 if is_num:
6041 value = val_num
6042
6043 return ((-10, 0) if value is None
6044 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6045 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6046 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6047 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6048 else (-1, value, 0))
6049
6050 def _calculate_field_preference(self, format, field):
6051 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6052 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6053 if type == 'multiple':
6054 type = 'field' # Only 'field' is allowed in multiple for now
6055 actual_fields = self._get_field_setting(field, 'field')
6056
6057 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6058 else:
6059 value = get_value(field)
6060 return self._calculate_field_preference_from_value(format, field, type, value)
6061
6062 def calculate_preference(self, format):
6063 # Determine missing protocol
6064 if not format.get('protocol'):
6065 format['protocol'] = determine_protocol(format)
6066
6067 # Determine missing ext
6068 if not format.get('ext') and 'url' in format:
6069 format['ext'] = determine_ext(format['url'])
6070 if format.get('vcodec') == 'none':
6071 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6072 format['video_ext'] = 'none'
6073 else:
6074 format['video_ext'] = format['ext']
6075 format['audio_ext'] = 'none'
6076 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6077 # format['preference'] = -1000
6078
6079 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # carrying HEVC in FLV is out of spec per the original FLV specification
6081 # ref. https://trac.ffmpeg.org/ticket/6389
6082 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
6083 format['preference'] = -100
6084
6085 # Determine missing bitrates
6086 if format.get('tbr') is None:
6087 if format.get('vbr') is not None and format.get('abr') is not None:
6088 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6089 else:
            if format.get('vcodec') != 'none' and format.get('vbr') is None:
                # `abr` may exist with a value of None; coalesce to 0 to avoid a TypeError
                format['vbr'] = format['tbr'] - (format.get('abr') or 0)
            if format.get('acodec') != 'none' and format.get('abr') is None:
                format['abr'] = format['tbr'] - (format.get('vbr') or 0)
6094
6095 return tuple(self._calculate_field_preference(format, field) for field in self._order)