]> jfr.im git - yt-dlp.git/blob - yt_dlp/utils/_utils.py
[utils] Add temporary shim for logging
[yt-dlp.git] / yt_dlp / utils / _utils.py
1 import asyncio
2 import atexit
3 import base64
4 import binascii
5 import calendar
6 import codecs
7 import collections
8 import collections.abc
9 import contextlib
10 import datetime
11 import email.header
12 import email.utils
13 import errno
14 import gzip
15 import hashlib
16 import hmac
17 import html.entities
18 import html.parser
19 import http.client
20 import http.cookiejar
21 import inspect
22 import io
23 import itertools
24 import json
25 import locale
26 import math
27 import mimetypes
28 import netrc
29 import operator
30 import os
31 import platform
32 import random
33 import re
34 import shlex
35 import socket
36 import ssl
37 import struct
38 import subprocess
39 import sys
40 import tempfile
41 import time
42 import traceback
43 import types
44 import unicodedata
45 import urllib.error
46 import urllib.parse
47 import urllib.request
48 import xml.etree.ElementTree
49 import zlib
50
51 from . import traversal
52
53 from ..compat import functools # isort: split
54 from ..compat import (
55 compat_etree_fromstring,
56 compat_expanduser,
57 compat_HTMLParseError,
58 compat_os_name,
59 compat_shlex_quote,
60 )
61 from ..dependencies import brotli, certifi, websockets, xattr
62 from ..socks import ProxyType, sockssocket
63
64 __name__ = __name__.rsplit('.', 1)[0] # Pretend to be the parent module
65
66 # This is not clearly defined otherwise
67 compiled_regex_type = type(re.compile(''))
68
69
def random_user_agent():
    """Return a desktop Chrome-on-Windows User-Agent with a randomly picked Chrome version."""
    chrome_versions = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    version = random.choice(chrome_versions)
    return ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            f'(KHTML, like Gecko) Chrome/{version} Safari/537.36')
113
114
# Content-Encodings we can decode; 'br' is advertised only when the optional
# brotli dependency is importable
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')
120
# Default HTTP headers sent with requests; User-Agent is randomized at import time
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


# Alternative User-Agent strings, selectable by name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
132
133
class NO_DEFAULT:
    """Sentinel used to distinguish "no default supplied" from a ``None`` default"""
    pass
136
137
def IDENTITY(x):
    """Identity function; usable as a default callback/transform"""
    return x
140
141
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Full month names per language code, used when parsing localized dates
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
# Timezone abbreviation -> offset from UTC in hours
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII transliteration
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
172
# strptime-style formats tried in order when parsing free-form dates
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Additional formats where ambiguous dates are interpreted day-first
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# Additional formats where ambiguous dates are interpreted month-first (US style)
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of "packed" (eval-obfuscated) JavaScript payloads
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Captures the JSON body of a <script type="application/ld+json"> block
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# A non-negative decimal number with optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
243
244
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Make sure the reported encoding actually works
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
259
260
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tmp = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tmp:
            json.dump(obj, tmp, ensure_ascii=False)
        if sys.platform == 'win32':
            # os.rename cannot replace an existing target on Windows
            # (raises WindowsError/FileExistsError), so remove it first
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile creates the file with restrictive permissions;
            # widen to the default mode (0o666 minus the process umask)
            umask = os.umask(0)
            os.umask(umask)
            os.chmod(tmp.name, 0o666 & ~umask)
        os.rename(tmp.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file before propagating
        with contextlib.suppress(OSError):
            os.remove(tmp.name)
        raise
285
286
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
292
293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
294 # the namespace parameter
295
296
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps in *path* to '{namespace}tag' using *ns_map*"""
    def expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(map(expand, path.split('/')))
307
308
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching *xpath* (a string or an iterable of
    candidate xpaths, tried in order).

    Returns *default* if given and nothing matches; raises ExtractorError
    when *fatal* and nothing matches; otherwise returns None.
    """
    if isinstance(xpath, str):
        n = node.find(xpath)
    else:
        # Try each candidate expression, keeping the first hit
        for xp in xpath:
            n = node.find(xp)
            if n is not None:
                break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element %s' % (xpath if name is None else name))
    return None
330
331
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but returns the matched element's text content"""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
345
346
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return the value of attribute *key* on the element matching xpath[@key]"""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (f'{xpath}[@{key}]' if name is None else name))
    return None
358
359
def get_element_by_id(id, html, **kwargs):
    """Return the content of the first tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)
363
364
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the first tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
368
369
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None
374
375
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None
380
381
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute value in the passed HTML document"""
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None
385
386
def get_element_html_by_attribute(attribute, value, html, **kwargs):
    """Return the html of the first tag with the specified attribute value in the passed HTML document"""
    # NB: renamed **kargs -> **kwargs for consistency with get_element_by_attribute;
    # the var-keyword name is not part of the caller-visible interface
    retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None
390
391
def get_elements_by_class(class_name, html, **kwargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match class_name as a whole token inside the (possibly multi-valued) class attribute
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
397
398
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # Match class_name as a whole token inside the (possibly multi-valued) class attribute
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
404
405
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document"""
    return list(map(operator.itemgetter(0), get_elements_text_and_html_by_attribute(*args, **kwargs)))
409
410
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document"""
    return list(map(operator.itemgetter(1), get_elements_text_and_html_by_attribute(*args, **kwargs)))
414
415
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    # If the value starts with whitespace or a quote-like character it can only
    # occur quoted in the attribute; otherwise surrounding quotes are optional.
    # 'quote' becomes the quantifier on the quote group below ('' = required,
    # '?' = optional)
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches the opening tag up to and including attribute=value; (?-x:...)
    # disables verbose mode so whitespace in 'value' is taken literally
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Delegate to the tag-based extractor to find the matching closing tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one pair of wrapping quotes (if any) before entity-decoding
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
441
442
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        super().__init__()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # When handle_endtag raises HTMLBreakOnClosingTagException mid-parse,
        # data stays buffered inside the parser; it is of no further interest,
        # so this override simply discards it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop (possibly implicitly-closed) tags until the matching opener is found
        while self.tagstack:
            if self.tagstack.pop() == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The very first opening tag has now been closed
            raise self.HTMLBreakOnClosingTagException()
483
484
485 # XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index, but raising *exc* instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Prime the parser with just the opening tag so its stack holds *tag*
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            # Feed up to each textual occurrence of the closing tag; the parser
            # raises HTMLBreakOnClosingTagException once the one that actually
            # closes our element (accounting for nesting) is consumed
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
519
520
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        super().__init__()
        self.attrs = {}

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        # Abort parsing as soon as the first tag has been captured
        raise compat_HTMLParseError('done')
531
532
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        # Only collect <li> elements at the top nesting level
        if self._level == 0 and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
548
549
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        # HTMLAttributeParser raises after the first start tag; suppress the
        # exception and keep the attributes it gathered
        parser.feed(html_element)
        parser.close()
    return parser.attrs
569
570
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
578
579
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Collapse runs of whitespace, turn <br> and paragraph breaks into
    # newlines, then strip all remaining tags and decode HTML entities
    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    html = re.sub('<.*?>', '', html)
    return unescapeHTML(html).strip()
594
595
class LenientJSONDecoder(json.JSONDecoder):
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        """
        @param transform_source  callable applied to the raw string before decoding
        @param ignore_extra      ignore trailing data after the first JSON value
        @param close_objects     how many unclosed objects/arrays to try to auto-close
        """
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # Each auto-close may take up to two repair passes (comma, then bracket)
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Try to repair truncated JSON at the error position by appending
        ',', '}' or ']'; returns the patched document or None if not repairable"""
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            # Inside an object: drop the trailing comma and close it
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            # Inside an array: drop the trailing comma and close it
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                # NOTE(review): if _close_object returned None here, s is None and
                # the slice below would raise TypeError instead of the JSON error
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
634
635
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        # '-' means standard output
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed or is unsupported; fall back to a plain open
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            # First failure: retry once with a sanitized filename
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
673
674
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
682
683
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Maps one character to its replacement; '\0' marks substitute
        # characters that get post-processed (deduplicated/stripped) below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # '?' and control characters are dropped outright
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
737
738
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows.

    On other platforms the path is returned unchanged unless force=True.
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters illegal in Windows path components, and trailing
    # whitespace/dots, with '#' ('.' and '..' components are kept as-is)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep force-sanitized absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
760
761
def sanitize_url(url, *, scheme='http'):
    """Add *scheme* to protocol-relative URLs and fix common URL typos"""
    if url is None:
        return None
    if url.startswith('//'):
        # Prepend protocol-less URLs with `http:` scheme in order to mitigate
        # the number of unwanted failures due to missing protocol
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    typo_fixes = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in typo_fixes:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
780
781
def extract_basic_auth(url):
    """Strip userinfo from *url*; returns (clean_url, 'Basic ...' header or None)"""
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild the netloc without the user:password@ prefix
    netloc = parts.hostname if parts.port is None else f'{parts.hostname}:{parts.port}'
    stripped_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = f"{parts.username}:{parts.password or ''}".encode()
    return stripped_url, f'Basic {base64.b64encode(credentials).decode()}'
792
793
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request with the URL sanitized/escaped and any
    embedded userinfo moved into an Authorization header"""
    clean_url, auth = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth is not None:
        # urllib.request.Request positional args are (data, headers, ...)
        if len(args) >= 2:
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth
    return urllib.request.Request(clean_url, *args, **kwargs)
800
801
def expand_path(s):
    """Expand shell environment variables and ~ in *s*"""
    return os.path.expandvars(compat_expanduser(s))
805
806
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable, preserving order"""
    def _dedupe():
        seen = []  # A list, not a set: items may be unhashable
        for item in iterable:
            if item not in seen:
                seen.append(item)
                yield item

    gen = _dedupe()
    return gen if lazy else list(gen)
817
818
819 def _htmlentity_transform(entity_with_semicolon):
820 """Transforms an HTML entity to a character."""
821 entity = entity_with_semicolon[:-1]
822
823 # Known non-numeric HTML entity
824 if entity in html.entities.name2codepoint:
825 return chr(html.entities.name2codepoint[entity])
826
827 # TODO: HTML5 allows entities without a semicolon.
828 # E.g. '&Eacuteric' should be decoded as 'Éric'.
829 if entity_with_semicolon in html.entities.html5:
830 return html.entities.html5[entity_with_semicolon]
831
832 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
833 if mobj is not None:
834 numstr = mobj.group(1)
835 if numstr.startswith('x'):
836 base = 16
837 numstr = '0%s' % numstr
838 else:
839 base = 10
840 # See https://github.com/ytdl-org/youtube-dl/issues/7518
841 with contextlib.suppress(ValueError):
842 return chr(int(numstr, base))
843
844 # Unknown entity in name, return its literal representation
845 return '&%s;' % entity
846
847
def unescapeHTML(s):
    """Replace HTML entities in *s* with their decoded characters"""
    if s is None:
        return None
    assert isinstance(s, str)

    def _replace(mobj):
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^&;]+;)', _replace, s)
855
856
def escapeHTML(text):
    """Escape the HTML-special characters &, <, >, \" and ' in *text*"""
    # Single-pass equivalent of chained str.replace calls ('&' handled first,
    # so already-escaped sequences are escaped again, as before)
    return text.translate(str.maketrans({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    }))
866
867
class netrc_from_content(netrc.netrc):
    """netrc parser fed from an in-memory string instead of a file"""

    def __init__(self, content):
        # Deliberately skip netrc.__init__, which insists on reading a file
        self.hosts = {}
        self.macros = {}
        with io.StringIO(content) as buf:
            self._parse('-', buf, False)
873
874
class Popen(subprocess.Popen):
    """subprocess.Popen with PyInstaller environment fixes, hidden console
    windows on Windows, text-mode defaults and a communicate-or-kill helper"""

    if sys.platform == 'win32':
        # Prevent a console window from being shown for spawned processes
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
             https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            # Not running from a PyInstaller bundle
            return

        def _fix(key):
            # PyInstaller saves the original value under <key>_ORIG
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether streams are text-mode so run() can pick '' vs b''
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        # Ensure the process does not outlive an interrupted communicate()
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            # Optionally wait for the killed process to terminate
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run the command to completion; returns (stdout, stderr, returncode)"""
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
931
932
def encodeArgument(s):
    """Coerce a command-line argument to str (legacy byte strings are ASCII-decoded)"""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
938
939
# Named fields for a duration split into clock components
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration in milliseconds into (hours, minutes, seconds, milliseconds)"""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
948
949
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds, e.g. 1:02:03 (or 1:02:03.456 with msec=True)"""
    hours, minutes, seconds, milliseconds = timetuple_from_msec(secs * 1000)
    if hours:
        ret = '%d%s%02d%s%02d' % (hours, delim, minutes, delim, seconds)
    elif minutes:
        ret = '%d%s%02d' % (minutes, delim, seconds)
    else:
        ret = '%d' % seconds
    if msec:
        ret = '%s.%03d' % (ret, milliseconds)
    return ret
959
960
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    # Loads x509 certificates usable for server auth from the given Windows
    # certificate store (e.g. 'CA', 'ROOT') into ssl_context
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        # Store not accessible; silently skip it
        return
    for cert in certs:
        # A single bad certificate must not prevent loading the rest
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
972
973
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose SSLContext is configured from
    *params* (certificate checking, legacy-server workarounds, client certs)"""
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer certifi's CA bundle when available (unless disabled via compat_opts)
        if certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1035
1036
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" blurb, joined after *before*.

    The message is capitalized when *before* is empty or ends a sentence.
    """
    from ..update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if before and not before.endswith(('.', '!', '?')):
        return f'{before} {msg}'

    msg = msg[0].title() + msg[1:]
    return f'{before} {msg}' if before else msg
1048
1049
class YoutubeDLError(Exception):
    """Root of the yt-dlp exception hierarchy."""
    msg = None  # default message; subclasses may override

    def __init__(self, msg=None):
        # Fall back to the class-level message, then to the class name.
        if msg is None:
            msg = self.msg if self.msg is not None else type(self).__name__
        self.msg = msg
        super().__init__(self.msg)
1060
1061
# Exceptions that indicate a (possibly transient) network problem rather than a bug
network_exceptions = (urllib.error.URLError, http.client.HTTPException, socket.error)
if hasattr(ssl, 'CertificateError'):  # always present on Python 3
    network_exceptions += (ssl.CertificateError,)
1066
1067
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        # Errors raised while a network exception is being handled are expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # Unwrap chained ExtractorErrors so exc_info points at the root cause
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Full message: "[ie] video_id: msg (caused by ...)" plus the
        # bug-report blurb for unexpected errors
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        # Combined traceback of this error and its cause (if any), or None
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Keep msg/args in sync when any attribute used by __msg changes
        # after construction (e.g. assigning .video_id later)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
1109
1110
class UnsupportedError(ExtractorError):
    """Raised when no suitable extractor exists for the given URL."""

    def __init__(self, url):
        super().__init__(f'Unsupported URL: {url}', expected=True)
        self.url = url
1116
1117
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regex pattern fails to match."""
1121
1122
class GeoRestrictedError(ExtractorError):
    """Raised when a video is unavailable from the user's geographic location.

    Thrown when a website refuses to serve a video because of geographic
    restrictions; always treated as an expected error.
    *countries*, if given, lists the country codes where the video is available.
    """

    def __init__(self, msg, countries=None, **kwargs):
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
1134
1135
class UserNotLive(ExtractorError):
    """Raised when the requested channel/user has no ongoing live stream."""

    def __init__(self, msg=None, **kwargs):
        super().__init__(msg or 'The channel is not currently live', **{**kwargs, 'expected': True})
1142
1143
class DownloadError(YoutubeDLError):
    """Raised by FileDownloader objects when not configured to continue on errors.

    Carries the appropriate error message; *exc_info*, if given, records the
    original exception (as returned by sys.exc_info()).
    """

    def __init__(self, msg, exc_info=None):
        self.exc_info = exc_info
        super().__init__(msg)
1156
1157
class EntryNotInPlaylist(YoutubeDLError):
    """Raised by YoutubeDL when a requested entry is missing from the playlist info_dict."""
    msg = 'Entry not found in info'
1165
1166
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        # Append the offending filename to the message when known.
        # (Previously the literal string '(unknown)' was appended, silently
        # discarding the *filename* argument.)
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)
1179
1180
class PostProcessingError(YoutubeDLError):
    """Raised from a PostProcessor's .run() method to signal a failed postprocessing task."""
1187
1188
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted."""
    msg = 'The download was cancelled'
1192
1193
class ExistingVideoReached(DownloadCancelled):
    """Raised when --break-on-existing is triggered."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1197
1198
class RejectedVideoReached(DownloadCancelled):
    """Raised when --break-match-filter is triggered."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1202
1203
class MaxDownloadsReached(DownloadCancelled):
    """Raised when the --max-downloads limit has been reached."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1207
1208
class ReExtractInfo(YoutubeDLError):
    """Signals that the video info needs to be extracted again."""

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1215
1216
class ThrottledDownload(ReExtractInfo):
    """Raised when the measured download speed falls below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)
1223
1224
class UnavailableVideoError(YoutubeDLError):
    """Raised when a video is requested in a format that is not available for it.

    *err*, if given, is appended to the base message.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1237
1238
class ContentTooShortError(YoutubeDLError):
    """Raised when the downloaded data is smaller than the server announced.

    Usually indicates an interrupted connection. Both sizes are in bytes.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1252
1253
class XAttrMetadataError(YoutubeDLError):
    """Raised when reading/writing extended file attributes fails.

    *code* is the OS errno (if any); *reason* classifies the failure as
    'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify the failure from the errno or the message text
        if code in (errno.ENOSPC, errno.EDQUOT) or 'No space left' in msg or 'Disk quota exceeded' in msg:
            self.reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1268
1269
class XAttrUnavailableError(YoutubeDLError):
    """Raised when no working implementation for extended attributes is available."""
1272
1273
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate *http_class*, optionally binding it to the configured source address.

    When params['source_address'] is set, address resolution is restricted to
    the IP family (IPv4 vs IPv6) matching that address before connecting.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Infer the address family from the textual form of the source address
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate in turn, keeping the last error for re-raise
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1319
1320
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open an HTTP connection, routing through a SOCKS proxy if requested."""
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded *data* (raw or zlib-wrapped)."""
        if not data:
            return data
        try:
            # raw deflate stream (no zlib header), as sent by some servers
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli-encoded *data* (requires the brotli dependency)."""
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        """Decompress gzip-encoded *data*, tolerating trailing garbage."""
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            # Retry with progressively shorter inputs before giving up
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Youtubedl-no-compression' in req.headers:  # deprecated
            req.headers.pop('Youtubedl-no-compression', None)
            req.add_header('Accept-encoding', 'identity')

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
1448
1449
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* that tunnels through the given SOCKS proxy.

    *socks_proxy* is a URL such as socks5://user:pass@host:port; supported
    schemes are socks/socks4, socks4a and socks5.

    @raises ValueError  if the proxy URL uses an unsupported scheme
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Previously an unknown scheme fell through and crashed later with
        # an opaque NameError; fail fast with a clear message instead
        raise ValueError(f'Unsupported SOCKS proxy scheme: {scheme}')

    def unquote_if_non_empty(s):
        # Empty/None credentials are passed through untouched
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # Wrap with TLS when tunnelling an HTTPS connection
            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1491
1492
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler that honours yt-dlp params (source address, SOCKS proxy, SSL context)."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        conn_class, kwargs = self._https_conn_class, {}

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        proxy = req.headers.get('Ytdl-socks-proxy')
        if proxy:
            conn_class = make_socks_conn_class(conn_class, proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            reason = e.reason
            # Surface a more actionable hint for legacy-renegotiation failures
            if isinstance(reason, ssl.SSLError) and getattr(reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE':
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
1521
1522
def is_path_like(f):
    """Whether *f* can be treated as a filesystem path (str, bytes or os.PathLike)."""
    return isinstance(f, (str, bytes, os.PathLike))
1525
1526
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """Cookie processor that also applies cookie handling to HTTPS traffic."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    # HTTPS uses the same cookie logic as plain HTTP
    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1536
1537
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    Based on the CPython HTTPRedirectHandler [1], with fixed and improved
    logic to better align with RFC 7231 and real browser behavior [2][3].

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        method = req.get_method()
        new_method, new_data = method, req.data

        # Technically the Cookie header should be in unredirected_hdrs,
        # however in practice some may set it in normal headers anyway.
        # We will remove it here to prevent any leaks.
        remove_headers = ['Cookie']

        # A 303 must use GET or HEAD for the subsequent request
        # (https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4);
        # browsers also commonly turn 301/302 POSTs into GETs
        # (sections 6.4.2 / 6.4.3), so we do the same.
        if (code == 303 and method != 'HEAD') or (code in (301, 302) and method == 'POST'):
            new_method = 'GET'

        # only remove payload if method changed (e.g. POST to GET)
        if new_method != method:
            new_data = None
            remove_headers += ['Content-Length', 'Content-Type']

        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
1586
1587
def extract_timezone(date_str):
    """Split a trailing timezone specification off *date_str*.

    Returns (timezone, date_str) where timezone is a datetime.timedelta and
    date_str has the matched timezone portion removed (when one was found).
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # Fall back to named timezones (e.g. "12:00 EST")
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        return datetime.timedelta(hours=timezone or 0), date_str

    date_str = date_str[:-len(m.group('tz'))]
    sign = m.group('sign')
    if not sign:  # bare 'Z' means UTC
        return datetime.timedelta(), date_str
    factor = 1 if sign == '+' else -1
    return datetime.timedelta(
        hours=factor * int(m.group('hours')),
        minutes=factor * int(m.group('minutes'))), date_str
1616
1617
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp for an ISO 8601-like date string, or None.

    @param delimiter  separator between date and time parts (default 'T')
    @param timezone   datetime.timedelta offset; extracted from the string when None
    """
    if date_str is None:
        return None

    # strptime's %S cannot handle fractional seconds - drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
        return calendar.timegm(dt.timetuple())
1633
1634
def date_formats(day_first=True):
    """Return the strptime format list matching the day/month ordering preference."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1637
1638
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD, or None."""
    if date_str is None:
        return None

    upload_date = None
    # Commas and AM/PM + timezone markers confuse strptime - strip them first
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    # Try every known format; the last one that parses wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')

    if upload_date is None:
        # Fall back to RFC 2822-style parsing
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')

    return str(upload_date) if upload_date is not None else None
1661
1662
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp for a free-form date string, or None.

    @param day_first  Whether ambiguous numeric dates are day-first (DD/MM)
    """
    if not isinstance(date_str, str):
        return None

    # Drop commas, pipes and weekday names; collapse runs of whitespace
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # Record the PM offset before the AM/PM marker is stripped below
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to RFC 2822-style parsing
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1694
1695
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from *url*, falling back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1707
1708
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build the subtitle filename by swapping the media extension for "<lang>.<format>"."""
    return replace_extension(filename, f'{sub_lang}.{sub_format}', expected_real_ext)
1711
1712
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.

    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # Plain DATE with no offset suffix
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # Recursively resolve the base date, then apply the signed offset
    base = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    if unit in ('month', 'year'):
        # timedelta cannot represent calendar months/years
        result = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit = 'day'
            amount *= 7
        result = base + datetime.timedelta(**{unit + 's': amount})
    return datetime_round(result, unit) if auto_precision else result
1753
1754
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1765
1766
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by a (possibly negative) number of months."""
    total = dt.month - 1 + months
    year = dt.year + total // 12
    month = total % 12 + 1
    # Clamp the day so e.g. Jan 31 + 1 month lands on the last day of February
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
1774
1775
def datetime_round(dt, precision='day'):
    """Round a datetime object's time to the given precision.

    @param precision  one of 'microsecond' (no-op), 'second', 'minute', 'hour', 'day'
    """
    if precision == 'microsecond':
        return dt

    unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    timestamp = calendar.timegm(dt.timetuple())
    rounded = ((timestamp + unit / 2) // unit) * unit
    return datetime.datetime.utcfromtimestamp(rounded)
1792
1793
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings not matching that format are returned unchanged.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
1802
1803
class DateRange:
    """An inclusive interval between two dates."""

    def __init__(self, start=None, end=None):
        """*start* and *end* must be strings in the format accepted by date_from_str;
        None means unbounded on that side."""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start, strict=True)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Return a range containing only the given day."""
        return cls(day, day)

    def __contains__(self, date):
        """Whether *date* (a datetime.date or parseable string) falls within the range."""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1837
1838
@functools.cache
def system_identifier():
    """One-line description of the Python runtime and platform, for debug output."""
    impl = platform.python_implementation()
    if impl == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        impl += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        impl,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1857
1858
@functools.cache
def get_windows_version():
    """Return the Windows version tuple, or () when not running on Windows."""
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1866
1867
def write_string(s, out=None, encoding=None):
    """Write the str *s* to *out* (default: stderr), encoding for byte streams."""
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    target, enc = out, None
    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: encode ourselves
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: write there directly
        target = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    target.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1887
1888
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Report a deprecation: once per unique message through the CLI printer
    when running as yt-dlp, or as a `DeprecationWarning` when used as a library."""
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return
    if msg in deprecation_warning._cache:
        return  # already reported once
    deprecation_warning._cache.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# De-duplication cache for CLI mode
deprecation_warning._cache = set()
1904
1905
def bytes_to_intlist(bs):
    """Convert a bytes-like object (or str) into a list of integer values."""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # bytes/bytearray already yield ints when indexed
        return list(bs)
    # str (or similar): fall back to code points
    return [ord(ch) for ch in bs]
1913
1914
def intlist_to_bytes(xs):
    """Pack a sequence of byte values (0-255) back into a bytes object."""
    if not xs:
        return b''
    return struct.pack(f'{len(xs)}B', *xs)
1919
1920
class LockingUnsupportedError(OSError):
    # Raised when neither the Windows (msvcrt) nor the fcntl locking backend
    # is available on this platform
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1926
1927
# Cross-platform file locking: defines _lock_file(f, exclusive, block) and
# _unlock_file(f) using Win32 LockFileEx/UnlockFileEx on Windows and
# fcntl.flock/lockf elsewhere
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure expected by LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the largest possible byte range of the file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the later unlock call
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK, 0x1 == LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # Lock is held by someone else and non-blocking was requested
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            # Try flock first, then lockf; last resort adds LOCK_NB
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
2014
2015
class locked_file:
    """A file wrapper that holds an OS-level lock on the file while it is open.

    Supports the plain 'r'/'rb'/'a'/'ab'/'w'/'wb' modes and can be used either
    as a context manager or via the open()/close() aliases. Unknown file
    attributes are proxied to the underlying file object.
    """
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only after we hold the lock, so concurrent readers
            # never observe a truncated-but-unlocked file
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the object can also be used without a `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate everything else (read, write, ...) to the wrapped file
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2079
2080
@functools.cache
def get_filesystem_encoding():
    """Name of the encoding used for filesystem paths ('utf-8' as a fallback)."""
    enc = sys.getfilesystemencoding()
    return 'utf-8' if enc is None else enc
2085
2086
def shell_quote(args):
    """Join *args* into a single shell-safe command line string."""
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(as_text(arg)) for arg in args)
2096
2097
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{payload}'
2106
2107
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url() back into (url, data).

    Returns (smug_url, default) when nothing was smuggled.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
2115
2116
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format *num* with a decimal (k, M, ...) or binary (Ki, Mi, ...) suffix.

    Returns None for negative or unparsable input.
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    suffixes = 'kMGTPEZY'
    exp = min(int(math.log(num, factor)), len(suffixes)) if num else 0
    unit = ['', *suffixes][exp]
    if factor == 1024:
        # Binary units: 'k' -> 'Ki', 'M' -> 'Mi', ... ('' stays '')
        unit = {'k': 'Ki', '': ''}.get(unit, f'{unit}i')
    return fmt % (num / factor ** exp, unit)
2129
2130
def format_bytes(bytes):
    """Human-readable byte count using binary units, or 'N/A'."""
    # NB: parameter name shadows the builtin, but is kept for interface compatibility
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2133
2134
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number> <unit>' in *s* using *unit_table* as a multiplier map.

    In non-strict mode ',' is also accepted as a decimal separator and only
    the beginning of *s* has to match. Returns a rounded int, or None.
    """
    num_pattern = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    unit_pattern = '|'.join(map(re.escape, unit_table))
    matcher = re.fullmatch if strict else re.match
    mobj = matcher(rf'(?P<num>{num_pattern})\s*(?P<unit>{unit_pattern})\b', s)
    if mobj is None:
        return None
    value = float(mobj.group('num').replace(',', '.'))
    return round(value * unit_table[mobj.group('unit')])
2146
2147
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    units = ['', *'KMGTPEZY']
    table = {unit: 1024**exp for exp, unit in enumerate(units)}
    return lookup_unit_table(table, s.upper(), strict=True)
2153
2154
def parse_filesize(s):
    """Parse a human-readable file size ('5 MiB', '300kB', ...) into bytes, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # NOTE(review): some lower-case mappings are deliberately inconsistent
    # (e.g. 'kB' -> 1024 but 'kb' -> 1000) — kept as-is for compatibility
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
2224
2225
def parse_count(s):
    """Parse a (possibly abbreviated) count like '1.2M' or '1,234 views' into an int."""
    if s is None:
        return None

    # Drop a leading non-numeric word ("Views 1,234" -> "1,234")
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    parsed = lookup_unit_table(multipliers, s)
    if parsed is not None:
        return parsed

    # Fall back to the leading number, if any ("1,234 likes")
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(mobj.group(1)) if mobj else None
2253
2254
def parse_resolution(s, *, lenient=False):
    """Extract {'width', 'height'} (or just {'height'}) from a resolution-like string.

    Understands 'WxH', '720p'/'1080i' and '4k'/'8k' forms; returns {} otherwise.
    """
    if s is None:
        return {}

    pattern = (r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
               else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    mobj = re.search(pattern, s)
    if mobj:
        return {'width': int(mobj.group('w')), 'height': int(mobj.group('h'))}

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(mobj.group(1)) * 540}

    return {}
2278
2279
def parse_bitrate(s):
    """Extract an integer bitrate in kbps from a string; None if absent."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
2286
2287
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in names:
        return None
    return names.index(name) + 1
2297
2298
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbrevs = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev not in abbrevs:
        return None
    return abbrevs.index(abbrev) + 1
2307
2308
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave already-escaped entities and character references untouched
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
2315
2316
def setproctitle(title):
    """Best-effort: set the process name shown by tools like `ps`.

    Uses libc prctl() via ctypes; silently does nothing where unavailable.
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        # Not a glibc system (e.g. Windows, musl without that soname)
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME (see prctl(2))
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2342
2343
def remove_start(s, start):
    """Strip the prefix *start* from *s* if present (None-safe)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2346
2347
def remove_end(s, end):
    """Strip the suffix *end* from *s* if present (None-safe)."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
2350
2351
def remove_quotes(s):
    """Drop one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
2359
2360
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    if netloc.startswith('www.'):
        netloc = netloc[len('www.'):]
    return netloc or None
2367
2368
def url_basename(url):
    """Last path component of *url* (ignores query and fragment)."""
    return urllib.parse.urlparse(url).path.strip('/').split('/')[-1]
2372
2373
def base_url(url):
    """Scheme + host + directory part of *url* (raises AttributeError if malformed)."""
    mobj = re.match(r'https?://[^?#]+/', url)
    return mobj.group()
2376
2377
def urljoin(base, path):
    """Join *base* and *path* like urllib, tolerating bytes input.

    Returns None for empty/invalid path or a base that is not an http(s)
    or protocol-relative URL; returns *path* unchanged when it is already absolute.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not path or not isinstance(path, str):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path  # already absolute (or protocol-relative)
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
2391
2392
class HEADRequest(urllib.request.Request):
    """A urllib Request that issues HEAD instead of GET."""
    def get_method(self):
        return 'HEAD'
2396
2397
class PUTRequest(urllib.request.Request):
    """A urllib Request that issues PUT instead of GET/POST."""
    def get_method(self):
        return 'PUT'
2401
2402
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """int(v) scaled by invscale/scale; *default* when conversion is impossible.

    When *get_attr* is given, the named attribute of *v* is converted instead.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        result = int(v)
    except (ValueError, TypeError, OverflowError):
        return default
    return result * invscale // scale
2410
2411
def str_or_none(v, default=None):
    """str(v), or *default* when v is None."""
    if v is None:
        return default
    return str(v)
2414
2415
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Strip thousands separators and stray '+'/'.' characters
        int_str = re.sub(r'[,\.\+]', '', int_str)
    try:
        return int(int_str)
    except (ValueError, TypeError, OverflowError):
        return None
2423
2424
def float_or_none(v, scale=1, invscale=1, default=None):
    """float(v) scaled by invscale/scale; *default* on None or bad input."""
    if v is None:
        return default
    try:
        result = float(v)
    except (ValueError, TypeError):
        return default
    return result * invscale / scale
2432
2433
def bool_or_none(v, default=None):
    """Pass through real booleans only; anything else becomes *default*."""
    if isinstance(v, bool):
        return v
    return default
2436
2437
def strip_or_none(v, default=None):
    """v.strip() for strings; *default* for any other type."""
    if isinstance(v, str):
        return v.strip()
    return default
2440
2441
def url_or_none(url):
    """Return the stripped URL if it uses a supported scheme (or is
    protocol-relative); otherwise None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
2447
2448
def request_to_url(req):
    """Accept either a urllib Request or a plain URL string; return the URL."""
    if isinstance(req, urllib.request.Request):
        return req.get_full_url()
    return req
2454
2455
def strftime_or_none(timestamp, date_format='%Y%m%d', default=None):
    """Format a unix timestamp (int/float) or a 'YYYYMMDD' string through
    strftime; returns *default* for anything unparsable."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Using naive datetime here can break timestamp() in Windows
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
            # Also, datetime.datetime.fromtimestamp breaks for negative timestamps
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            dt = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                  + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            dt = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            dt = None  # triggers AttributeError below -> default
        # Emulate %s on platforms whose strftime lacks it (e.g. Windows)
        date_format = re.sub(
            r'(?<!%)(%%)*%s', rf'\g<1>{int(dt.timestamp())}', date_format)
        return dt.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2473
2474
def parse_duration(s):
    """Parse a duration string ('1:23:45', '2h 30m', 'PT1H30M', '90 min', ...)
    into seconds (float), or None when unrecognized."""
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None
    # 1) Clock-style [[DD:]HH:]MM:SS[.ms]
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    # 2) ISO-8601-ish / verbose ('P1DT2H', '3 days, 4 hours', '90s', ...)
    elif (m := re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)):
        days, hours, mins, secs, ms = m.groups()
    # 3) Fractional '3.5 hours' / '90 minutes'
    elif (m := re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)):
        hours, mins = m.groups()
    else:
        return None

    if ms:
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2529
2530
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the file extension: 'f.mp4' -> 'f.temp.mp4'.

    When *expected_real_ext* is given and the actual extension differs,
    *ext* is appended after the whole name instead ('f.mkv' -> 'f.mkv.temp'),
    so the unexpected extension is preserved.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return f'{name}.{ext}{real_ext}'
    # BUG FIX: the mismatch branch previously returned f'(unknown).{ext}',
    # discarding the original filename entirely; keep the name and append ext
    return f'{filename}.{ext}'
2537
2538
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the extension of *filename* for *ext*.

    When *expected_real_ext* is given and the real extension differs, the
    old extension is kept and *ext* is appended instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return f'{name}.{ext}'
2544
2545
def check_executable(exe, args=()):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: default changed from the mutable `[]` to an immutable tuple
    # (best practice); callers passing a list are unaffected
    try:
        Popen.run([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
2554
2555
def _get_exe_version_output(exe, args):
    """Run `exe args...` and capture combined stdout/stderr.

    Returns the output string on success, None when the process exits
    non-zero, or False when the executable cannot be run at all.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
                                   stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if ret:
            return None
    except OSError:
        return False
    return stdout
2568
2569
2570 def detect_exe_version(output, version_re=None, unrecognized='present'):
2571 assert isinstance(output, str)
2572 if version_re is None:
2573 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2574 m = re.search(version_re, output)
2575 if m:
2576 return m.group(1)
2577 else:
2578 return unrecognized
2579
2580
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        # Executable exists but exited non-zero
        return unrecognized[-1]
    if not output:
        # False (not runnable) or empty output: pass through
        return output
    return detect_exe_version(output, version_re, unrecognized[0])
2591
2592
def frange(start=0, stop=None, step=1):
    """Float range"""
    if stop is None:
        start, stop = 0, start
    # step == 0 yields nothing; negative step counts downwards
    direction = (step > 0) - (step < 0)
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2601
2602
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache  # items consumed so far
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map index i to its mirror from the end: i -> -(i + 1); None passes through
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Consume only as many items as the requested indices need
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # Shares the cache and the (partially consumed) iterable with self
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2690
2691
class PagedList:
    """Base class for lazily paged sequences; subclasses implement _getslice()."""

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc  # pagenum -> iterable of entries
        self._pagesize = pagesize
        self._pagecount = float('inf')  # unknown until a fetch fails
        self._use_cache = use_cache
        self._cache = {}  # pagenum -> list of entries

    def getpage(self, pagenum):
        page_results = self._cache.get(pagenum)
        if page_results is None:
            # Pages beyond the known page count are empty by definition
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # NOTE: cache is required so repeated indexing does not refetch pages
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
2730
2731
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested range within the current page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last good page so later getpage() calls
                # return [] instead of refetching
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2771
2772
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        skip_elems = start - start_page * self._pagesize  # offset within first page
        only_more = None if end is None else end - start  # entries still wanted
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
2797
2798
class PlaylistEntries:
    """Iterates a playlist infodict's entries according to the user's
    playlist_items/playliststart/playlistend parameters, yielding
    (1-based index, entry) pairs."""

    # Placeholder for entries missing from an incomplete (requested_entries) list
    MissingEntry = object()
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Re-expand to full playlist positions, gaps become MissingEntry
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches item specs like '5', '2:10', '::2', '-5:', '1:inf:3'
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield ints and slices parsed from a --playlist-items spec; raises ValueError."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # float_or_none turns 'inf' into float('inf'); plain ints stay ints
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        # May return None when the total count cannot be determined cheaply
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function i -> entry that normalizes all backing stores'
        # out-of-range behavior to self.IndexError
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2933
2934
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
2941
2942
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
2949
2950
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe_chars)
2954
2955
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)

    def esc(component):
        # Same escaping rules as escape_rfc3986()
        return urllib.parse.quote(component, b"%/;:@&=+$,!~*'()?#[]")

    return parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=esc(parts.path),
        params=esc(parts.params),
        query=esc(parts.query),
        fragment=esc(parts.fragment),
    ).geturl()
2966
2967
def parse_qs(url, **kwargs):
    """Parse the query string of *url* into a dict mapping keys to value lists."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2970
2971
def read_batch_urls(batch_fd):
    """Read a batch file and return the list of URLs it contains.

    @param batch_fd  A file object (or iterable of str/bytes lines); closed on return
    @returns         List of cleaned URLs; blank lines and comments are dropped
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # Strip both the raw UTF-8 BOM bytes (mis-decoded as latin-ish chars)
        # and the already-decoded BOM codepoint
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit must be passed by keyword (positional form is
        # deprecated since Python 3.13)
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2989
2990
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2993
2994
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url str or parse url tuple
    @param query_update update query
    @returns str
    """
    if isinstance(url, str):
        # Fast path: nothing to change
        if not kwargs and not query_update:
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged_query = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged_query, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
3013
3014
def update_url_query(url, query):
    """Merge the dict *query* into the query string of *url*."""
    return update_url(url, query_update=query)
3017
3018
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone a urllib Request, optionally overriding its URL, data, headers or query."""
    new_headers = req.headers.copy()
    new_headers.update(headers or {})
    full_url = update_url_query(url or req.get_full_url(), query)
    # Preserve the original HTTP verb for HEAD/PUT requests
    method = req.get_method()
    if method == 'HEAD':
        request_class = HEADRequest
    elif method == 'PUT':
        request_class = PUTRequest
    else:
        request_class = urllib.request.Request
    new_req = request_class(
        full_url, data=data or req.data, headers=new_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3036 return new_req
3037
3038
3039 def _multipart_encode_impl(data, boundary):
3040 content_type = 'multipart/form-data; boundary=%s' % boundary
3041
3042 out = b''
3043 for k, v in data.items():
3044 out += b'--' + boundary.encode('ascii') + b'\r\n'
3045 if isinstance(k, str):
3046 k = k.encode()
3047 if isinstance(v, str):
3048 v = v.encode()
3049 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3050 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3051 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3052 if boundary.encode('ascii') in content:
3053 raise ValueError('Boundary overlaps with data')
3054 out += content
3055
3056 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3057
3058 return out, content_type
3059
3060
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    if boundary is not None:
        # A caller-supplied boundary cannot be regenerated on collision,
        # so any ValueError propagates to the caller
        return _multipart_encode_impl(data, boundary)

    while True:
        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, candidate)
        except ValueError:
            # Random boundary collided with the payload; try another one
            continue
3089
3090
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Whether *x* is an iterable collection (str/bytes/mappings excluded by default)."""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
3095
3096
def variadic(x, allowed_types=NO_DEFAULT):
    """Return *x* unchanged if it is an iterable collection, else wrap it in a 1-tuple."""
    if not isinstance(allowed_types, (tuple, type)):
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    if is_iterable_like(x, blocked_types=allowed_types):
        return x
    return (x, )
3102
3103
def try_call(*funcs, expected_type=None, args=(), kwargs=None):
    """Call each of *funcs* in turn and return the first usable result.

    A result is usable if the call raises none of the common lookup/type
    errors and (when *expected_type* is given) is an instance of it.
    Returns None if no function produced a usable result.

    NB: the old mutable defaults (args=[], kwargs={}) were replaced with
    immutable/None sentinels; behavior for all existing callers is unchanged.
    """
    for f in funcs:
        try:
            val = f(*args, **(kwargs or {}))
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            pass
        else:
            if expected_type is None or isinstance(val, expected_type):
                return val
3113
3114
def try_get(src, getter, expected_type=None):
    # Legacy wrapper around try_call: apply one or more getter callables to
    # src, returning the first result that doesn't raise (and matches
    # expected_type, if given)
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3117
3118
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of *dct* with only the items for which cndn(key, value) is true."""
    return dict(filter(lambda item: cndn(*item), dct.items()))
3121
3122
def merge_dicts(*dicts):
    """Merge dicts left to right: earlier non-None values win, except that an
    earlier empty string may be replaced by a later string value."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            if key not in merged:
                if value is not None:
                    merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
3131
3132
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Decode bytes-like input to str with the given encoding; str passes through.
    # NB: the default encoding is evaluated once, at module import time
    return string if isinstance(string, str) else str(string, encoding, errors)
3135
3136
# US movie ratings mapped to the age limits used by parse_age_limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV parental guideline labels mapped to age limits
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """Parse an age limit (int, '18+', US rating or TV guideline) into an int or None."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]

    # Accept 'TV-MA', 'TV_MA' and 'TVMA' style spellings
    suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv = re.match(r'^TV[_-]?(%s)$' % suffixes, s)
    if tv:
        return TV_PARENTAL_GUIDELINES['TV-' + tv.group(1)]
    return None
3172
3173
def strip_jsonp(code):
    """Remove a JSONP callback wrapper, returning the bare JSON payload."""
    jsonp_wrapper = re.compile(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''')
    return jsonp_wrapper.sub(r'\g<callback_data>', code)
3182
3183
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript value/object literal into valid JSON text.

    @param code    JavaScript source fragment to convert
    @param vars    dict of variable name -> replacement JSON/string value
    @param strict  if True, raise ValueError on unknown identifiers instead
                   of quoting them as strings
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # Hex/octal integer literals, optionally followed by ':' when used as object keys
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        # Map JS string escapes to their JSON equivalents
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Evaluate a `${...}` interpolation inside a template literal
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        # Normalize a single matched token (string, number, keyword, ...) to JSON
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # Integer object keys must become JSON strings
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # Convert `new Map([[k, v], ...])` into a JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # Best-effort rewrites of common JS constructor/IIFE patterns
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
3262
3263
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def resolve(qid):
        # Unknown qualities sort below every known one
        if qid not in quality_ids:
            return -1
        return quality_ids.index(qid)
    return resolve
3272
3273
# Accepted "when" stages for postprocessing -- TODO confirm exact pipeline semantics
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3275
3276
# Default output filename templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types; values look like default filename infixes
# (e.g. 'info.json'), None meaning no extra infix -- TODO confirm against callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
3294
# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template regex: callers fill {0} with the allowed key pattern and {1} with
# the allowed conversion-type pattern before compiling
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])? # unused in python
        {1} # conversion type
    )
    '''


# All %-conversion type characters accepted by STR_FORMAT_RE_TMPL users
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
3313
3314
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    # Truncate so the result (including the ellipses) fits within `length`
    return s if len(s) <= length else s[:length - len(ELLIPSES)] + ELLIPSES
3323
3324
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
3327
3328
def is_outdated_version(version, limit, assume_new=True):
    """Whether *version* is older than *limit*; unparsable input follows assume_new."""
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        # Non-numeric version component: fall back to the caller's assumption
        return not assume_new
    return outdated
3336
3337
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Deferred local import, presumably to avoid a circular dependency at
    # module load -- TODO confirm
    from ..update import is_non_updateable

    return not is_non_updateable()
3344
3345
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(compat_shlex_quote, args))
3349
3350
def error_to_str(err):
    """Format an exception as 'TypeName: message'."""
    return '{}: {}'.format(type(err).__name__, err)
3353
3354
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (optionally with parameters) to a file extension.

    @param mt       MIME type string, e.g. 'video/mp4; codecs="avc1"'
    @param default  Returned for non-string input and unknown types; when
                    omitted, non-string input yields None and unknown
                    subtypes are returned (with '+' replaced by '.')
    """
    if not isinstance(mt, str):
        if default is not NO_DEFAULT:
            return default
        return None

    # Keys are either bare subtypes or full 'type/subtype' MIME types
    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop any parameters (';charset=...') and normalize case
    mimetype = mt.partition(';')[0].strip().lower()
    _, _, subtype = mimetype.rpartition('/')

    # Try the full MIME type, then the bare subtype, then the '+suffix' part
    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    elif default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
3443
3444
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension or URL (None if unknown)."""
    if not ext_or_url:
        return None
    # Bare extensions need a fake filename for mimetypes to work
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime, _ = mimetypes.guess_type(filename)
    return mime
3451
3452
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string into vcodec/acodec/dynamic_range
    (and scodec when a subtitle codec is present)."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Strip leading zeros from numeric parts before splitting on '.'
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                continue  # only the first video codec is kept
            vcodec = full_codec
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                # av1 with 4th field '10' is treated as HDR10 -- TODO confirm
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                # vp9 profile 2 is treated as HDR10 -- TODO confirm
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Two unrecognized codecs: assume video + audio in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3493
3494
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension compatible with the given codec and
    extension lists of the streams being merged.

    @param preferences  Ordered candidate extensions; when None/empty,
                        mkv is always an acceptable fallback
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    # More than one video or audio stream: only mkv is known to support it
    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Reduce e.g. 'avc1.640028' to 'avc1' (leading zeros already dropped)
    sanitize_codec = functools.partial(
        try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    # First pass: pick by codec compatibility
    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    # Second pass: fall back to extension-family compatibility
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3534
3535
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Guess the file extension of a response: try the Content-Disposition
    filename, then the x-amz-meta-name header, then the Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    meta_ext = getheader('x-amz-meta-name')
    if meta_ext:
        # Take the part after the last '.'
        e = meta_ext.rpartition('.')[2]
        if e:
            return e

    return mimetype2ext(getheader('Content-Type'), default=default)
3554
3555
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI from raw bytes and a MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3558
3559
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No limit configured, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3568
3569
# List of known byte-order-marks (BOM)
# NB: keep the 4-byte UTF-32 marks before their 2-byte UTF-16 prefixes
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip repeated BOMs and remember the encoding they imply
        while first_bytes.startswith(bom):
            encoding = bom_encoding
            first_bytes = first_bytes[len(bom):]
    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3589
3590
def determine_protocol(info_dict):
    """Infer the download protocol of *info_dict* from its URL and extension."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = sanitize_url(info_dict['url'])
    # Scheme-prefixed streaming protocols (also matches rtmpe/rtmps/mmsh/...)
    for scheme in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(scheme):
            return scheme

    ext = determine_ext(url)
    if ext == 'm3u8':
        # Live HLS cannot use the native downloader
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    if ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3611
3612
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: terminal escape sequences and tab markers don't count
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep columns whose filter entry is truthy; missing entries keep the column
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # hide_empty: drop columns whose every data cell is empty (max width 0)
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a delimiter row between the header and the data
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Right-align the text after the tab by padding before it
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3643
3644
3645 def _match_one(filter_part, dct, incomplete):
3646 # TODO: Generalize code with YoutubeDL._build_format_filter
3647 STRING_OPERATORS = {
3648 '*=': operator.contains,
3649 '^=': lambda attr, value: attr.startswith(value),
3650 '$=': lambda attr, value: attr.endswith(value),
3651 '~=': lambda attr, value: re.search(value, attr),
3652 }
3653 COMPARISON_OPERATORS = {
3654 **STRING_OPERATORS,
3655 '<=': operator.le, # "<=" must be defined above "<"
3656 '<': operator.lt,
3657 '>=': operator.ge,
3658 '>': operator.gt,
3659 '=': operator.eq,
3660 }
3661
3662 if isinstance(incomplete, bool):
3663 is_incomplete = lambda _: incomplete
3664 else:
3665 is_incomplete = lambda k: k in incomplete
3666
3667 operator_rex = re.compile(r'''(?x)
3668 (?P<key>[a-z_]+)
3669 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3670 (?:
3671 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3672 (?P<strval>.+?)
3673 )
3674 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3675 m = operator_rex.fullmatch(filter_part.strip())
3676 if m:
3677 m = m.groupdict()
3678 unnegated_op = COMPARISON_OPERATORS[m['op']]
3679 if m['negation']:
3680 op = lambda attr, value: not unnegated_op(attr, value)
3681 else:
3682 op = unnegated_op
3683 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3684 if m['quote']:
3685 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3686 actual_value = dct.get(m['key'])
3687 numeric_comparison = None
3688 if isinstance(actual_value, (int, float)):
3689 # If the original field is a string and matching comparisonvalue is
3690 # a number we should respect the origin of the original field
3691 # and process comparison value as a string (see
3692 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3693 try:
3694 numeric_comparison = int(comparison_value)
3695 except ValueError:
3696 numeric_comparison = parse_filesize(comparison_value)
3697 if numeric_comparison is None:
3698 numeric_comparison = parse_filesize(f'{comparison_value}B')
3699 if numeric_comparison is None:
3700 numeric_comparison = parse_duration(comparison_value)
3701 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3702 raise ValueError('Operator %s only supports string values!' % m['op'])
3703 if actual_value is None:
3704 return is_incomplete(m['key']) or m['none_inclusive']
3705 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3706
3707 UNARY_OPERATORS = {
3708 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3709 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3710 }
3711 operator_rex = re.compile(r'''(?x)
3712 (?P<op>%s)\s*(?P<key>[a-z_]+)
3713 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3714 m = operator_rex.fullmatch(filter_part.strip())
3715 if m:
3716 op = UNARY_OPERATORS[m.group('op')]
3717 actual_value = dct.get(m.group('key'))
3718 if is_incomplete(m.group('key')) and actual_value is None:
3719 return True
3720 return op(actual_value)
3721
3722 raise ValueError('Invalid filter part %r' % filter_part)
3723
3724
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
        Can be True/False to indicate all/none of the keys may be missing.
        All conditions on incomplete keys pass if the key is missing
    """
    # Split on unescaped '&'; every sub-filter must pass
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3735
3736
def match_filter_func(filters, breaking_filters=None):
    """Build a match-filter function from filter string(s).

    @param filters           Filter string(s); the special value '-' makes the
                             returned function interactive (see below)
    @param breaking_filters  Filters whose match raises RejectedVideoReached
    @returns None if no filters given; otherwise a function
             (info_dict, incomplete) -> None when the entry passes,
             NO_DEFAULT when interactive and the entry passes a complete check,
             or a skip-reason string when it fails
    """
    if not filters and not breaking_filters:
        return None
    # Recursively build the breaking filters into a no-op-or-match function
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        ret = breaking_filters(info_dict, incomplete)
        if ret is not None:
            raise RejectedVideoReached(ret)

        # An entry passes if any single filter matches (or there are none)
        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            return NO_DEFAULT if interactive and not incomplete else None
        else:
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3759
3760
class download_range_func:
    """Callable yielding the sections (chapters and/or time ranges) of a video
    to download."""

    def __init__(self, chapters, ranges, from_info=False):
        # chapters: regexes matched against chapter titles
        # ranges: (start, end) pairs in seconds; negative values offset from the end
        # from_info: also use start_time/end_time from the info dict itself
        self.chapters, self.ranges, self.from_info = chapters, ranges, from_info

    def __call__(self, info_dict, ydl):

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for i, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': i}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {
                'start_time': self._handle_negative_timestamp(start, info_dict),
                'end_time': self._handle_negative_timestamp(end, info_dict),
            }

        if self.from_info and (info_dict.get('start_time') or info_dict.get('end_time')):
            yield {
                'start_time': info_dict.get('start_time') or 0,
                'end_time': info_dict.get('end_time') or float('inf'),
            }
        elif not self.ranges and not self.chapters:
            # No constraints at all: a single empty section means "everything"
            yield {}

    @staticmethod
    def _handle_negative_timestamp(time, info):
        # Negative timestamps count back from the end (requires a known duration)
        return max(info['duration'] + time, 0) if info.get('duration') and time < 0 else time

    def __eq__(self, other):
        # NOTE(review): from_info is not part of equality; defining __eq__
        # without __hash__ also makes instances unhashable
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges)

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3801
3802
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression into seconds (None if unparsable)."""
    if not time_expr:
        return None

    # Plain offset, optionally suffixed with 's'
    offset = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    # Clock format H+:MM:SS[.fff] (frames after ':' are treated as a fraction)
    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
3814
3815
def srt_subtitles_timecode(seconds):
    # Format seconds as an SRT timestamp: HH:MM:SS,mmm
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3818
3819
def ass_subtitles_timecode(seconds):
    # Format seconds as an ASS timestamp: H:MM:SS.cc (centiseconds)
    time = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3823
3824
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Old TTAF namespaces that are rewritten to the modern TTML ones below
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    # Streaming XML target that renders TTML styling into SRT-ish markup
    # (<b>/<i>/<u>/<font>) as elements are opened and closed
    class TTMLPElementParser:
        # NOTE(review): the list attributes below are class-level mutables
        # shared between instances; safe here since each parse_node() call
        # uses a fresh parser and leaves them balanced -- confirm before reuse
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect from the parent element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        # Re-serialize the <p> subtree and stream it through a fresh parser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; repeat until every parent style is available
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # Styles attached to body/div become the document-wide default
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3991
3992
def cli_option(params, command_option, param, separator=None):
    """Build a CLI argument list for an option that takes a value.

    Looks up *param* in *params*; returns [] when unset, the two-element
    form ['--opt', 'value'] by default, or the joined single-element form
    ['--opt<separator>value'] when *separator* is given.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3998
3999
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a CLI argument list for a boolean option.

    *param* must resolve to True, False or None; the boolean is mapped to
    *true_value*/*false_value* and passed through cli_option (so None
    yields [] and *separator* joins option and value when given).
    """
    value = params.get(param)
    assert value in (True, False, None)
    # Reuse cli_option by handing it a {bool: rendered value} mapping
    return cli_option({True: true_value, False: false_value}, command_option, value, separator)
4004
4005
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return ['--opt'] when *param* equals *expected_value*, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
4008
4009
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Select extractor/postprocessor argument lists from *argdict*.

    *keys* is a list of key names (or tuples of names); the first key group
    with at least one non-None entry wins and its argument lists are
    flattened. A plain list/tuple *argdict* is legacy input: returned as-is
    when *use_compat*, otherwise treated as absent.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = [argdict.get(key.lower()) for key in variadic(key_list)]
        matched = [args for args in candidates if args is not None]
        if matched:
            # Flatten the per-key argument lists into one
            return [arg for args in matched for arg in args]
    return default
4028
4029
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve configuration args for *exe* under *main_key*.

    Builds the lookup key priority list (most specific first) and delegates
    to cli_configuration_args. Compat behaviour is only kept when no key
    suffixes were requested (i.e. the bare root key is in the list).
    """
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in keys:
        # Suffixed keys only: legacy list/tuple argdicts do not apply
        use_compat = False
    else:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
4041
4042
class ISO639Utils:
    """Conversions between ISO 639-1 (2-letter) and ISO 639-2/T (3-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of *code* are considered, so region
        suffixes (e.g. 'en-US') are tolerated. Returns None when unknown.
        """
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Uses a lazily-built reverse map instead of a linear scan per call.
        setdefault keeps the *first* 2-letter code for duplicated 3-letter
        values (e.g. 'heb' -> 'he', not the deprecated 'iw'), matching the
        original first-match iteration order. Returns None when unknown.
        """
        if '_long2short_map' not in cls.__dict__:
            reverse = {}
            for short_name, long_name in cls._lang_map.items():
                reverse.setdefault(long_name, short_name)
            cls._long2short_map = reverse
        return cls._long2short_map.get(code)
4247
4248
class ISO3166Utils:
    """Lookup of full English country names from 2-letter country codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Look up the full English country name for a 2-letter country code.

        The code is case-insensitive; returns None when unknown.
        """
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
4510
4511
class GeoUtils:
    """Helpers for picking geo-appropriate IPv4 addresses."""
    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (dotted string) from a CIDR block.

        *code_or_block* is either a 2-letter country code (looked up in
        _country_ip_map; None when unknown) or a CIDR block string
        'a.b.c.d/prefix'.
        """
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        base_addr, _, prefix_len = block.partition('/')
        # Lowest address in the block, as a 32-bit big-endian integer
        lowest = struct.unpack('!L', socket.inet_aton(base_addr))[0]
        # Highest address: set all host bits
        highest = lowest | (0xffffffff >> int(prefix_len))
        picked = random.randint(lowest, highest)
        return str(socket.inet_ntoa(struct.pack('!L', picked)))
4770
4771
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler that allows overriding the proxy per request via the
    internal 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            # The lambda defaults capture the current values of `type` and
            # `proxy_open` eagerly (late-binding workaround); each scheme gets
            # its own <scheme>_open handler routed through proxy_open.
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (if present) overrides the handler default;
        # the internal header is stripped before the request goes out.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            # SOCKS proxies are signalled via a header; the socket is wrapped
            # with SOCKS by yt-dlp's http/https handlers instead of here.
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        # Fall back to the standard urllib proxy behaviour
        return urllib.request.ProxyHandler.proxy_open(
            self, req, proxy, type)
4795
4796
4797 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4798 # released into Public Domain
4799 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4800
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # int.to_bytes replaces the original hand-rolled 32-bit chunking loop.
    # Negative inputs behaved exactly like 0 in the original (the chunk loop
    # never ran and the fallback produced b'\x00'), so clamp to preserve that.
    n = max(int(n), 0)
    # Minimal big-endian representation; n == 0 still yields one zero byte
    s = n.to_bytes(max(1, (n.bit_length() + 7) // 8), 'big')
    # Front-pad with zeros up to a multiple of blocksize, if requested
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4829
4830
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes is the stdlib equivalent of the original front-padded
    # 4-byte struct.unpack loop; it also returns 0 for empty input.
    return int.from_bytes(s, 'big')
4846
4847
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # Bytes are interpreted little-endian, hence the reversal before hexlify
    plaintext_int = int(binascii.hexlify(data[::-1]), 16)
    ciphertext_int = pow(plaintext_int, exponent, modulus)
    return format(ciphertext_int, 'x')
4863
4864
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    @raises ValueError when data does not fit (needs >= 11 bytes of overhead)
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 8017 (EME-PKCS1-v1_5) requires the padding string PS to consist of
    # *non-zero* octets — the previous randint(0, 254) could emit a zero,
    # which would make a decrypter truncate the message at the stray zero.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4878
4879
4880 def _base_n_table(n, table):
4881 if not table and not n:
4882 raise ValueError('Either table or n must be specified')
4883 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4884
4885 if n and n != len(table):
4886 raise ValueError(f'base {n} exceeds table length {len(table)}')
4887 return table
4888
4889
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4901
4902
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    lookup = {char: index for index, char in enumerate(_base_n_table(n, table))}
    base = len(lookup)
    result = 0
    for char in string:
        result = result * base + lookup[char]
    return result
4910
4911
def decode_packed_codes(code):
    """Decode obfuscated 'packed' JS code (presumably the P.A.C.K.E.R. format
    matched by PACKED_CODES_RE) by substituting back its symbol table."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each base-n token back to its original symbol (the token itself
    # when the symbol slot is empty)
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)],
        obfuscated_code)
4928
4929
def caesar(s, alphabet, shift):
    """Shift every character of *s* that occurs in *alphabet* by *shift*
    positions (wrapping); other characters pass through unchanged."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4937
4938
def rot47(s):
    """Apply the ROT47 substitution cipher (rotates over printable ASCII '!'..'~')."""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4941
4942
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list string into a dict, unquoting quoted values."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
4950
4951
def urshift(val, n):
    """Unsigned (logical) 32-bit right shift of *val* by *n* bits."""
    if val >= 0:
        return val >> n
    # Reinterpret a negative value as its unsigned 32-bit equivalent first
    return (val + 0x100000000) >> n
4954
4955
def write_xattr(path, key, value):
    """Write extended attribute *key* = *value* (bytes) on the file at *path*.

    Tries, in order: NTFS Alternate Data Streams (Windows), the xattr/pyxattr
    Python modules, then the setfattr/xattr command-line tools.
    Raises XAttrMetadataError when a write fails and XAttrUnavailableError
    when no supported backend is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The CLI tools take the value as text, not bytes
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
5005
5006
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the string components
    of a uniformly random date between 1950-01-01 and 1995-12-31."""
    start = datetime.date(1950, 1, 1)
    end = datetime.date(1995, 12, 31)
    chosen = start + datetime.timedelta(random.randint(0, (end - start).days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
5017
5018
def find_available_port(interface=''):
    """Ask the OS for a currently-free TCP port on *interface*.

    Returns the port number, or None when binding fails.
    """
    try:
        with socket.socket() as sock:
            sock.bind((interface, 0))
            _addr, port = sock.getsockname()
            return port
    except OSError:
        return None
5026
5027
# Templates for internet shortcut files, which are plain text files.

# Windows Internet Shortcut (.url) file
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS Finder web location (.webloc) file — an XML property list
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org desktop entry (.desktop) of Type=Link
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps the --write-link format name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5059
5060
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    # NOTE(review): an explicit port 80 is dropped regardless of scheme (also
    # for e.g. https) — confirm this is intended before relying on it.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5103
5104
def to_high_limit_path(path):
    """On Windows, convert *path* to an extended-length ('\\\\?\\') absolute path
    to work around the MAX_PATH limitation. The maximum allowed length for
    individual path segments may still be quite limited. Elsewhere, the path is
    returned unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # The extended-length prefix requires an absolute path
    return '\\\\?\\' + os.path.abspath(path)
5111
5112
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Traverse *field* out of *obj*, returning ``template % func(value)``,
    or *default* when the value is falsy (or matches *ignore* if given)."""
    val = traversal.traverse_obj(obj, *variadic(field))
    skip = (not val) if ignore is NO_DEFAULT else (val in variadic(ignore))
    return default if skip else template % func(val)
5118
5119
def clean_podcast_url(url):
    """Strip known podcast analytics/measurement redirect prefixes from *url*."""
    without_tracker = re.sub(r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com|
                chrt\.fm/track|
                mgln\.ai/e
            )(?:/[^/.]+)?|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e|
            [0-9]\.gum\.fm|
            pscrb\.fm/rss/p
        )/''', '', url)
    # Collapse the doubled scheme left behind when the removed prefix carried its own
    return re.sub(r'^\w+://(\w+://)', r'\1', without_tracker)
5140
5141
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random UUIDv4-shaped string (every x/y slot gets an arbitrary hex digit)."""
    return re.sub(r'[xy]', lambda _: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5147
5148
def make_dir(path, to_screen=None):
    """
    Create the parent directory of *path* (including intermediates) if needed.

    @param to_screen    Optional callable used to report a failure message
    @returns            True on success (or when no directory part exists), False on OSError
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Bug fix: this was `callable(to_screen) is not None`, which is always
        # truthy (a bool is never None), so `None(...)` was called and raised
        # TypeError instead of returning False
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
5159
5160
def get_executable_path():
    """Return the directory containing the running yt-dlp executable/script."""
    from ..update import _get_variant_and_executable_path

    exe_path = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(exe_path))
5165
5166
def get_user_config_dirs(package_name):
    """Yield candidate per-user configuration directories for *package_name*."""
    # XDG base directory (e.g. ~/.config/package_name)
    xdg_base = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
    yield os.path.join(xdg_base, package_name)

    # Windows roaming appdata (%APPDATA%/package_name)
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # Classic dotted home directory (~/.package_name)
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')
5179
5180
def get_system_config_dirs(package_name):
    """Yield system-wide configuration directories for *package_name*."""
    # Only /etc/<package_name> is considered
    yield os.path.join('/etc', package_name)
5184
5185
def time_seconds(**kwargs):
    """
    Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z),
    shifted by the given datetime.timedelta keyword arguments (hours=, days=, ...)
    """
    offset = datetime.timedelta(**kwargs).total_seconds()
    return time.time() + offset
5191
5192
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create a JSON Web Signature (JWS) with the HS256 algorithm.

    The result is in JWS Compact Serialization.
    Implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
    and JWS https://www.rfc-editor.org/rfc/rfc7515.html

    NOTE: segments use standard (padded) base64, matching the original
    implementation, not the unpadded base64url the RFC prescribes.

    @param payload_data  JSON-serializable payload (the JWT claims set)
    @param key           shared HMAC secret as str
    @param headers       optional extra/overriding JOSE header fields
    @returns             the token as bytes

    Bug fix: the `headers` default was a mutable dict (`headers={}`); it is now
    None, which is backward-compatible and avoids the shared-default pitfall.
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    signature = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256).digest()
    return header_b64 + b'.' + payload_b64 + b'.' + base64.b64encode(signature)
5210
5211
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode the payload of a JWS compact token WITHOUT verifying its signature."""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # Re-add trailing '='s that may have been stripped; superfluous '='s are ignored
    return json.loads(base64.urlsafe_b64decode(payload_b64 + '==='))
5218
5219
# False on Windows until windows_enable_vt_mode() succeeds; None elsewhere
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Whether *stream* looks capable of rendering terminal (VT/ANSI) escape sequences."""
    if compat_os_name == 'nt':
        # Requires VT processing to have been enabled first
        enabled = bool(WINDOWS_VT_MODE)
    else:
        enabled = bool(os.getenv('TERM'))
    if not enabled:
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
5234
5235
def windows_enable_vt_mode():
    """Enable virtual-terminal (ANSI escape) processing on the Windows console.

    No-op on Windows versions older than 10.0.10586. On success, sets the
    module-global WINDOWS_VT_MODE and invalidates the
    supports_terminal_sequences() cache.
    Ref: https://bugs.python.org/issue30075 """
    if get_windows_version() < (10, 0, 10586):
        return

    # Imported lazily since these are only available/needed on Windows
    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly, so redirected stdout/stderr do not interfere
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Add VT processing to the existing console mode flags
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    # Only reached when both console calls succeeded (exceptions propagate above)
    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    supports_terminal_sequences.cache_clear()
5266
5267
# Matches SGR-style escape sequences: ESC '[' ... 'm'
_terminal_sequences_re = re.compile('\x1b\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip ANSI/VT color and style escape sequences from *string*."""
    return _terminal_sequences_re.sub('', string)
5273
5274
def number_of_digits(number):
    """Length of the base-10 integer rendering of *number* (including any '-' sign).

    Uses '%d' deliberately, so float inputs are truncated to their integer part."""
    rendered = '%d' % number
    return len(rendered)
5277
5278
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the truthy *values* with *delim*; when *from_dict* is given,
    each value is first treated as a traversal path into it."""
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(v) for v in values if v)
5283
5284
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        # No format reported a width; leave thumbnails untouched
        return thumbnails
    width_repl = str(max_dimensions[0])
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, width_repl, thumb['url'])},
            dict(zip(_keys, max_dimensions)), thumb)
        for thumb in thumbnails
    ]
5305
5306
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not mobj:
        # Missing or unparsable header
        return None, None, None
    return int(mobj.group(1)), int_or_none(mobj.group(2)), int_or_none(mobj.group(3))
5315
5316
def read_stdin(what):
    """Announce that *what* is being read interactively, then return sys.stdin."""
    eof_key = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof_key}) to end:\n')
    return sys.stdin
5321
5322
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """

    # BOM marks are given priority over declarations
    for bom, enc in BOMS:
        if data.startswith(bom):
            return enc, len(bom)

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data.replace(b'\0', b''))
    if mobj:
        return mobj.group(1).decode(), 0
    return None, 0
5339
5340
class Config:
    """A (possibly nested) set of command-line options, loaded from arguments,
    config files and/or stdin. Nested configs are created for each
    `--config-locations` entry discovered while parsing."""
    own_args = None     # the raw args this config was created from
    parsed_args = None  # set by load_configs()
    filename = None     # path of the file this config was read from, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        # _loaded_paths is shared with child configs to avoid recursive loading
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """One-time initialization; returns False if *filename* was already loaded."""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own_args and recursively load any `--config-locations` they reference."""
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # '-' means read options from stdin (at most once)
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against this config's directory
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read and shlex-split a config file; returns *default* if it does not exist."""
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # Bug fix: the message previously contained the literal text
            # "(unknown)" instead of interpolating the offending filename
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of *opts* with values of credential options replaced by 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create and attach a child config sharing this config's loaded-path set."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        # Deepest (most recently appended) configs first, own args last
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
5448
5449
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes"""
    # Holds the connected websocket protocol object once __enter__ has run
    pool = None

    def __init__(self, url, headers=None, connect=True):
        # A dedicated event loop lets synchronous callers drive the connection
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Ensure the connection is torn down at interpreter exit
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        # Blocking send over the wrapped connection
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        # Blocking receive over the wrapped connection
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # NOTE(review): the loop is closed *before* _cancel_all_tasks() runs
            # on it -- confirm this ordering is intended
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        # Run the coroutine *main* to completion on *loop*, mirroring
        # asyncio.run()'s shutdown of async generators and the default executor
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        # Cancel every outstanding task on *loop* and surface unhandled exceptions
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5519
5520
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in headers.items():
            merged[name.title()] = value
    return merged
5524
5525
def cached_method(f):
    """Memoize a method per-instance, keyed on its bound (non-self) arguments."""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        # Drop `self` from the key; the cache itself lives on the instance
        key = tuple(bound.arguments.values())[1:]

        instance_caches = vars(self).setdefault('_cached_method__cache', {})
        cache = instance_caches.setdefault(f.__name__, {})
        if key not in cache:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper
5541
5542
class classproperty:
    """property access for class methods with optional caching"""

    def __new__(cls, func=None, *args, **kwargs):
        # Bare-call usage `@classproperty(cache=True)`: return a decorator
        # that will re-enter with the actual function
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, instance, cls):
        if self._cache is None:
            return self.func(cls)
        if cls not in self._cache:
            # Cache per owning class, so subclasses get their own value
            self._cache[cls] = self.func(cls)
        return self._cache[cls]
5561
5562
class function_with_repr:
    """Callable wrapper whose repr() is either a custom string or the
    wrapped function's qualified name."""

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func, self.__repr = func, repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5575
5576
class Namespace(types.SimpleNamespace):
    """Immutable namespace; iterating yields the attribute values in insertion order"""

    def __iter__(self):
        yield from self.__dict__.values()

    @property
    def items_(self):
        # Trailing underscore avoids clashing with a potential 'items' attribute
        return self.__dict__.items()
5586
5587
# Known media file extensions, grouped by kind. The `common_*` groups are
# merged into `video`/`audio` below; tuple order is meaningful to consumers.
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# Fold the common groups into the full video/audio lists
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# Every extension considered downloadable media (excludes thumbnails/subtitles)
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5602
5603
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # `_error` uses NO_DEFAULT as a sentinel for "no error this attempt";
    # None here means "not yet iterated"
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        # @param _retries          maximum number of retries (falsy -> 0)
        # @param _error_callback   called as (error, attempt, retries, **kwargs) after each failed attempt
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # Retry while the previous attempt errored (or never ran) and attempts remain
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # Expose the sentinel as None to callers
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Reset to "no error" before handing control to the loop body
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self.error:
                # Body assigned an error; report it before (possibly) retrying
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            # Retries exhausted: delegate to `error` if given, else re-raise
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # Prefer the underlying cause's message for readability
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        # sleep_func may be a callable taking the retry number, or a plain number
        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5658
5659
def make_archive_id(ie, video_id):
    """Build a download-archive entry: '<lowercased ie key> <video_id>'."""
    if not isinstance(ie, str):
        ie = ie.ie_key()
    return f'{ie.lower()} {video_id}'
5663
5664
def truncate_string(s, left, right=0):
    """Shorten *s* to at most left+right characters, replacing the middle
    (or end, when right == 0) with '...'. None passes through unchanged."""
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    tail = s[-right:] if right else ''
    return f'{s[:left - 3]}...{tail}'
5670
5671
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Resolve *options* (with aliases and '-'-prefixed negations) into an
    ordered, de-duplicated list of the entries in alias_dict['all'].

    @param options     requested names; '-name' removes previous matches
    @param alias_dict  maps alias -> list of names; must contain 'all'
    @param use_regex   treat non-alias entries as case-insensitive regexes
    @param start       initial already-selected names
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for val in options:
        discard = val.startswith('-')
        if discard:
            val = val[1:]

        if val in alias_dict:
            # Discarding an alias negates each of its members (a member that is
            # itself negated gets un-negated), then recurses to expand them
            val = alias_dict[val] if not discard else [
                i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(val, alias_dict, start=requested)
            continue

        current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
                   else [val] if val in alias_dict['all'] else None)
        if current is None:
            raise ValueError(val)

        if discard:
            # Remove every earlier occurrence of each matched name
            for item in current:
                while item in requested:
                    requested.remove(item)
        else:
            requested.extend(current)

    return orderedSet(requested)
5700
5701
# TODO: Rewrite
class FormatSorter:
    """Sorts media formats according to user/extractor-supplied sort fields."""

    # Grammar of one sort token: optional '+' (reverse), a field name, and an
    # optional ':'/'~' separator followed by a limit value
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Legacy youtube-dl-compatible ordering
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration consumed by _get_field_setting()/_resolve_field_value()
    settings = {
        # 'ordered' fields rank their value against an explicit preference list
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        # 'order_free' is preferred when the user opts into free formats
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        # Internal/forced fields not shown in verbose output
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        # 'combined'/'multiple' fields derive their value from several others
        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'), 'convert': 'float_none',
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'), 'convert': 'bytes',
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }
5790
    def __init__(self, ydl, field_preference):
        """
        @param ydl               YoutubeDL instance (supplies params and logging)
        @param field_preference  extractor-provided sort order
        """
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)
5797
    def _get_field_setting(self, field, key):
        """Look up *key* for *field* in self.settings, lazily filling in defaults
        (and registering unknown fields with a deprecation warning)."""
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            # Defaults depend on the field's declared 'type'
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            # Cache the computed default for subsequent lookups
            propObj[key] = default
        return propObj[key]
5816
    def _resolve_field_value(self, field, value, convertNone=False):
        """Convert a raw field/limit *value* according to the field's 'convert'
        setting; for 'order' fields, higher return values rank better."""
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            # Rank against the (possibly free-format-preferring) order list
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            # 'float_string': numeric strings become floats; anything else
            # permanently switches the field to string comparison
            if value.isnumeric():
                return float(value)
            else:
                self.settings[field]['convert'] = 'string'
                return value
5850
    def evaluate_params(self, params, sort_extractor):
        """Build self._order and per-field sort data from forced/priority
        defaults, user preferences, extractor preferences and the defaults
        (earlier entries win; duplicates are ignored)."""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Register one sort field; first occurrence wins
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # Assembly order: forced defaults, then (unless format_sort_force)
        # priority defaults, then user prefs, extractor prefs and the defaults
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                # Resolve aliases to their canonical field, warning if deprecated
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            # Combined fields expand into their sub-fields, each with its own limit
            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)
5906
    def print_verbose_info(self, write_debug):
        """Log the user/extractor-given sort orders and the resolved field list
        (visible fields only) via *write_debug*."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))
5919
    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Map one field *value* to a comparable tuple; larger tuples sort as
        better formats."""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            # Clamp extractor-given preferences at the configured maximum
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        # The leading element groups: missing < non-numeric < numeric; within
        # the numeric group, 'closest'/'reverse'/limit shape the comparison
        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))
5948
5949 def _calculate_field_preference(self, format, field):
5950 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
5951 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
5952 if type == 'multiple':
5953 type = 'field' # Only 'field' is allowed in multiple for now
5954 actual_fields = self._get_field_setting(field, 'field')
5955
5956 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
5957 else:
5958 value = get_value(field)
5959 return self._calculate_field_preference_from_value(format, field, type, value)
5960
    def calculate_preference(self, format):
        """Return the sort key tuple for *format*.

        Mutates the format dict in place first, filling in missing derived
        fields (protocol, ext, audio/video ext, bitrates, and a preference
        penalty for out-of-spec codec/container pairs), then evaluates every
        configured sort field in order.
        """
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        # Split the container ext into audio_ext/video_ext depending on which
        # codec (if any) the format carries
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #    format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates: tbr ~ vbr + abr, so derive whichever
        # one is absent from the other two (order matters: zero out the
        # codec-less side first so the subtractions below see it)
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)
5997
5998
5999 # XXX: Temporary
6000 class _YDLLogger:
6001 def __init__(self, ydl=None):
6002 self._ydl = ydl
6003
6004 def debug(self, message):
6005 if self._ydl:
6006 self._ydl.write_debug(message)
6007
6008 def info(self, message):
6009 if self._ydl:
6010 self._ydl.to_screen(message)
6011
6012 def warning(self, message, *, once=False):
6013 if self._ydl:
6014 self._ydl.report_warning(message, only_once=once)
6015
6016 def error(self, message, *, is_error=True):
6017 if self._ydl:
6018 self._ydl.report_error(message, is_error=is_error)
6019
6020 def stdout(self, message):
6021 if self._ydl:
6022 self._ydl.to_stdout(message)
6023
6024 def stderr(self, message):
6025 if self._ydl:
6026 self._ydl.to_stderr(message)