]> jfr.im git - yt-dlp.git/blob - yt_dlp/utils/_utils.py
Add option `--netrc-cmd` (#6682)
[yt-dlp.git] / yt_dlp / utils / _utils.py
1 import asyncio
2 import atexit
3 import base64
4 import binascii
5 import calendar
6 import codecs
7 import collections
8 import collections.abc
9 import contextlib
10 import datetime
11 import email.header
12 import email.utils
13 import errno
14 import gzip
15 import hashlib
16 import hmac
17 import html.entities
18 import html.parser
19 import http.client
20 import http.cookiejar
21 import inspect
22 import io
23 import itertools
24 import json
25 import locale
26 import math
27 import mimetypes
28 import netrc
29 import operator
30 import os
31 import platform
32 import random
33 import re
34 import shlex
35 import socket
36 import ssl
37 import struct
38 import subprocess
39 import sys
40 import tempfile
41 import time
42 import traceback
43 import types
44 import unicodedata
45 import urllib.error
46 import urllib.parse
47 import urllib.request
48 import xml.etree.ElementTree
49 import zlib
50
51 from . import traversal
52
53 from ..compat import functools # isort: split
54 from ..compat import (
55 compat_etree_fromstring,
56 compat_expanduser,
57 compat_HTMLParseError,
58 compat_os_name,
59 compat_shlex_quote,
60 )
61 from ..dependencies import brotli, certifi, websockets, xattr
62 from ..socks import ProxyType, sockssocket
63
# Pretend to be the parent module so that strings built from __name__
# (e.g. the deprecation message in process_communicate_or_kill below)
# show the public `yt_dlp.utils` path rather than the private `_utils` one
__name__ = __name__.rsplit('.', 1)[0]  # Pretend to be the parent module

# The type of a compiled regular expression; the stdlib does not expose
# this under a clearly defined public name
compiled_regex_type = type(re.compile(''))
68
69
def random_user_agent():
    """Return a Chrome-on-Windows User-Agent string with a randomly picked Chrome version."""
    template = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    chrome_versions = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return template % random.choice(chrome_versions)
113
114
# Content-Encodings the HTTP layer can decode; 'br' only when the optional
# brotli dependency is importable
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default headers for outgoing HTTP requests (browser-like fingerprint)
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


# Fixed User-Agent strings selectable by name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
132
133
class NO_DEFAULT:
    """Sentinel used as a default value to distinguish "no default supplied"
    from an explicit default of None (see e.g. xpath_element)."""
    pass
136
137
def IDENTITY(x):
    """Identity function: return the argument unchanged."""
    return x
140
141
# English month names, in calendar order (index + 1 == month number)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Localized month-name tables keyed by language code
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# UTC offsets (in hours) for common timezone abbreviations
# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# Accented character -> ASCII transliteration map;
# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
172
# strptime patterns tried, in order, when parsing free-form date strings
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Extra patterns for locales that write the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
    '%H:%M %d/%m/%Y',
])

# Extra patterns for locales that write the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the trailing argument list of "packed" (eval-obfuscated) JavaScript
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Captures the JSON body of <script type="application/ld+json"> blocks
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

# A decimal number with an optional fractional part
NUMBER_RE = r'\d+(?:\.\d+)?'
243
244
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Verify the codec actually works; a broken locale raises here
        'TEST'.encode(encoding)
    except Exception:
        encoding = 'UTF-8'
    return encoding
259
260
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """
    tmp = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')
    try:
        with tmp:
            json.dump(obj, tmp, ensure_ascii=False)
        if sys.platform == 'win32':
            # os.rename over an existing target raises WindowsError /
            # FileExistsError on Windows, so drop the target first
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # Give the temp file normal (umask-honoring) permissions instead
            # of NamedTemporaryFile's restrictive 0o600
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tmp.name, 0o666 & ~mask)
        os.rename(tmp.name, fn)
    except Exception:
        # Best-effort cleanup of the orphaned temp file, then re-raise
        with contextlib.suppress(OSError):
            os.remove(tmp.name)
        raise
285
286
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    predicate = '[@%s]' % key if val is None else f"[@{key}='{val}']"
    return node.find(xpath + predicate)
292
# Historical note: on Python 2.6 the xml.etree.ElementTree.Element methods
# did not support the namespace parameter, hence the manual expansion below
295
296
def xpath_with_ns(path, ns_map):
    """Expand ``prefix:tag`` steps of *path* into ``{uri}tag`` form using *ns_map*."""
    def expand(step):
        if ':' not in step:
            return step
        ns, tag = step.split(':')
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join(expand(step) for step in path.split('/'))
307
308
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching *xpath* (a string or iterable of strings).

    Returns *default* when given and nothing matches; otherwise raises
    ExtractorError if *fatal*, else returns None.
    """
    if isinstance(xpath, str):
        n = node.find(xpath)
    else:
        for xp in xpath:
            n = node.find(xp)
            if n is not None:
                break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element %s' % (name if name is not None else xpath))
    return None
330
331
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but return the matched element's text."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element\'s text %s' % (name if name is not None else xpath))
    return None
345
346
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return the value of attribute *key* on the element matching xpath[@key]."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (f'{xpath}[@{key}]' if name is None else name))
    return None
358
359
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)
363
364
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
368
369
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
374
375
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    return next(iter(get_elements_html_by_class(class_name, html)), None)
380
381
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute in the passed HTML document"""
    matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    return next(iter(matches), None)
385
386
def get_element_html_by_attribute(attribute, value, html, **kwargs):
    """Return the html of the first tag with the specified attribute in the passed HTML document"""
    matches = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    if not matches:
        return None
    return matches[0]
390
391
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # NOTE(review): **kargs is accepted for signature compatibility but not forwarded
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
397
398
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
404
405
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return list(map(operator.itemgetter(0), get_elements_text_and_html_by_attribute(*args, **kwargs)))
409
410
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return list(map(operator.itemgetter(1), get_elements_text_and_html_by_attribute(*args, **kwargs)))
414
415
def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Yield (text, whole_element_html) pairs for every tag in *html* carrying
    attribute=value.  *tag* is a regex restricting which tag names match;
    *escape_value* treats *value* as a literal rather than a regex.
    """
    if not value:
        return

    # Quotes around the attribute value are optional unless the value starts
    # with a character that may not appear unquoted in HTML
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches from '<tag' through attribute="value" (rest of the element is
    # recovered by get_element_text_and_html_by_tag from the match start)
    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one optional pair of surrounding quotes, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
441
442
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Control-flow exception: signals the outermost tag has been closed
        pass

    def __init__(self):
        # Stack of currently-open tag names, outermost first
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until the matching opening tag, tolerating unclosed inner tags
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        # Empty stack means the outermost tag was just closed
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
483
484
# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index wrapper raising *exc* instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed only the opening tag first so the tag stack starts with *tag*
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            # Feed up to each candidate closing tag; the parser raises
            # HTMLBreakOnClosingTagException only when the tag that closes
            # the outermost element is reached (nested same-name tags are
            # tracked by the tag stack)
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
519
520
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        super().__init__()

    def handle_starttag(self, tag, attrs):
        # Capture the first element's attributes, then abort parsing
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')
531
532
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        self.items = []
        self._level = 0  # nesting depth; only top-level <li> elements are collected

    def handle_starttag(self, tag, attrs):
        if self._level == 0 and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
548
549
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    except compat_HTMLParseError:
        # Raised by the parser as soon as the first start tag was seen
        pass
    return parser.attrs
569
570
def parse_list(webpage):
    """Parse a run of HTML <li> elements and return a list with each element's attributes"""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
578
579
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    if html is None:  # Convenience for sanitizing descriptions etc.
        return None

    for pattern, replacement in (
        (r'\s+', ' '),                                # collapse whitespace runs
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),         # <br> variants -> newline
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),   # paragraph boundary -> newline
        ('<.*?>', ''),                                # strip remaining tags
    ):
        html = re.sub(pattern, replacement, html)
    # Replace html entities
    return unescapeHTML(html).strip()
594
595
class LenientJSONDecoder(json.JSONDecoder):
    """json.JSONDecoder that can pre-transform the input, ignore trailing
    garbage and attempt to close unterminated objects/arrays.

    @param transform_source  callable applied to the string before decoding
    @param ignore_extra      ignore any data after the first JSON value
    @param close_objects     maximum number of unclosed objects/arrays to auto-close
    """
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        # Each close can take up to two attempts (insert ',' first, then the bracket)
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        """Return *err*.doc with one closing step applied, or None if it cannot be fixed up."""
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                if attempt < self._close_attempts:
                    # Keep the failed document in a separate name so the
                    # final error below never operates on None
                    closed = self._close_object(e)
                    if closed is not None:
                        s = closed
                        continue
                # Clamp the window start: e.pos - 10 must not go negative,
                # or the slice wraps around and shows the wrong context
                raise type(e)(f'{e.msg} in {s[max(e.pos - 10, 0):e.pos + 10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
634
635
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # First attempt uses the name as given; the second uses a sanitized path
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Fall back to an unlocked open if locking is unavailable/fails
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # EACCES is a permission problem, not a bad-filename problem
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                # Sanitizing changed nothing, retrying would be pointless
                raise
673
674
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input mirrors parsedate_tz's None
        return None
    return email.utils.mktime_tz(parsed)
682
683
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Replacements are emitted with a leading NUL marker ('\0') so the
        # collapse/strip passes below can tell substitutes from literal chars
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Drop '?' and control characters entirely
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
737
738
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows.

    On other platforms the path is returned unchanged unless *force* is set,
    in which case the Windows rules are applied anyway (without a drive/UNC part).
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters forbidden in Windows path components, and a
    # trailing space/dot, with '#'; keep '.'/'..' traversal parts intact
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Preserve absoluteness when forced on POSIX paths
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
760
761
def sanitize_url(url, *, scheme='http'):
    """Prepend a scheme to protocol-relative URLs and fix common scheme typos.

    None passes through unchanged.
    """
    if url is None:
        return None
    if url.startswith('//'):
        # Prepend protocol-less URLs with `http:` scheme in order to mitigate
        # the number of unwanted failures due to missing protocol
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    for typo, correction in (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    ):
        if re.match(typo, url):
            return re.sub(typo, correction, url)
    return url
780
781
def extract_basic_auth(url):
    """Strip userinfo from *url*; return (clean_url, 'Basic ...' header value or None)."""
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    # Rebuild netloc without the user:password@ prefix
    netloc = parts.hostname if parts.port is None else '%s:%d' % (parts.hostname, parts.port)
    clean_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = '%s:%s' % (parts.username, parts.password or '')
    token = base64.b64encode(credentials.encode()).decode()
    return clean_url, f'Basic {token}'
792
793
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request from *url* after sanitizing/escaping it and
    moving any userinfo credentials into an Authorization header."""
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # headers is the second positional argument of urllib.request.Request
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
800
801
def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))
805
806
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable, preserving first-seen order."""
    def generate():
        seen = []  # a list, not a set: the items may be unhashable
        for item in iterable:
            if item not in seen:
                seen.append(item)
                yield item

    return generate() if lazy else list(generate())
817
818
819 def _htmlentity_transform(entity_with_semicolon):
820 """Transforms an HTML entity to a character."""
821 entity = entity_with_semicolon[:-1]
822
823 # Known non-numeric HTML entity
824 if entity in html.entities.name2codepoint:
825 return chr(html.entities.name2codepoint[entity])
826
827 # TODO: HTML5 allows entities without a semicolon.
828 # E.g. '&Eacuteric' should be decoded as 'Éric'.
829 if entity_with_semicolon in html.entities.html5:
830 return html.entities.html5[entity_with_semicolon]
831
832 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
833 if mobj is not None:
834 numstr = mobj.group(1)
835 if numstr.startswith('x'):
836 base = 16
837 numstr = '0%s' % numstr
838 else:
839 base = 10
840 # See https://github.com/ytdl-org/youtube-dl/issues/7518
841 with contextlib.suppress(ValueError):
842 return chr(int(numstr, base))
843
844 # Unknown entity in name, return its literal representation
845 return '&%s;' % entity
846
847
def unescapeHTML(s):
    """Replace HTML entities in *s* with their characters; None passes through."""
    if s is None:
        return None
    assert isinstance(s, str)

    def replace_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^&;]+;)', replace_entity, s)
855
856
def escapeHTML(text):
    """Escape &, <, >, double and single quotes for safe embedding in HTML."""
    return text.translate(str.maketrans({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    }))
866
867
class netrc_from_content(netrc.netrc):
    """A netrc.netrc that parses its entries from an in-memory string."""

    def __init__(self, content):
        # Deliberately skip netrc.__init__, which insists on reading a file
        self.hosts = {}
        self.macros = {}
        with io.StringIO(content) as stream:
            self._parse('-', stream, False)
873
874
def process_communicate_or_kill(p, *args, **kwargs):
    """Deprecated shim for Popen.communicate_or_kill; warns and delegates."""
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)
879
880
class Popen(subprocess.Popen):
    """subprocess.Popen with console-window suppression on Windows,
    PyInstaller environment fix-ups, UTF-8 text-mode defaults and
    kill-on-failure communicate helpers."""

    if sys.platform == 'win32':
        # Prevent a console window from flashing up for spawned processes
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            # Not running from a PyInstaller bundle
            return

        def _fix(key):
            # PyInstaller saves the original value under <key>_ORIG
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        # Remember whether output will be text so run() can pick '' vs b''
        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        # Like communicate(), but kill the child if communication fails/aborts
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            # timeout=None waits indefinitely for the child to die
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run the command to completion; return (stdout, stderr, returncode)."""
        with cls(*args, **kwargs) as proc:
            # proc.__text_mode resolves via name mangling to _Popen__text_mode
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
937
938
def encodeArgument(s):
    """Return *s* as str, decoding legacy ASCII byte strings if necessary."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
944
945
# Simple duration record: hours/minutes/seconds/milliseconds
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into an (hours, minutes, seconds, milliseconds) tuple."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
954
955
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H<delim>]MM<delim>SS, optionally with .mmm."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        formatted = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        formatted = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        formatted = '%d' % t.seconds
    if msec:
        return '%s.%03d' % (formatted, t.milliseconds)
    return formatted
965
966
def _ssl_load_windows_store_certs(ssl_context, storename):
    """Load trusted x509 certificates from the given Windows cert store into *ssl_context*."""
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        # Store not readable by this process; silently skip it
        return
    for cert in certs:
        # Individual bad certificates must not abort loading the rest
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
978
979
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose ssl.SSLContext reflects the
    certificate-checking, legacy-connect and client-certificate options in *params*."""
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer certifi's CA bundle when available, unless disabled via compat_opts
        if certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
1041
1042
def bug_reports_message(before=';'):
    """Return the standard "please report this" blurb to be appended after
    the text `before`; capitalized when it starts a new sentence."""
    from ..update import REPOSITORY

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    before = before.rstrip()
    starts_sentence = not before or before.endswith(('.', '!', '?'))
    if starts_sentence:
        msg = msg[0].title() + msg[1:]

    if not before:
        return msg
    return f'{before} {msg}'
1054
1055
class YoutubeDLError(Exception):
    """Base exception for all errors raised by yt-dlp."""
    # Default message; subclasses may override this at class level.
    msg = None

    def __init__(self, msg=None):
        # An explicit message wins; otherwise fall back to the class-level
        # default, and finally to the exception's own class name.
        if msg is None:
            msg = self.msg if self.msg is not None else type(self).__name__
        self.msg = msg
        super().__init__(msg)
1066
1067
# Exceptions that indicate a (usually transient) network-level failure.
network_exceptions = (urllib.error.URLError, http.client.HTTPException, socket.error)
# ssl.CertificateError exists on all supported Pythons, but keep the
# feature test for stripped-down builds.
if hasattr(ssl, 'CertificateError'):
    network_exceptions += (ssl.CertificateError,)
1072
1073
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        # Errors raised while a network exception is being handled are
        # treated as expected: network failures are not yt-dlp bugs.
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # Unwrap nested ExtractorErrors so exc_info points at the root cause.
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        # Name-mangled to _ExtractorError__msg. Composes the final message
        # from ie/video_id/cause, appending the bug-report blurb for
        # unexpected errors.
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        # Human-readable rendering of the stored traceback and cause chain.
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        # Keep msg/args in sync when a contributing attribute is changed
        # after construction (e.g. an extractor fills in video_id later).
        # The guard on self.msg skips rebuilds during __init__.
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property
1115
1116
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        # expected=True: an unsupported site is not a bug in yt-dlp.
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
1122
1123
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression fails to match."""
1127
1128
class GeoRestrictedError(ExtractorError):
    """Raised when a video cannot be accessed from the user's geographic
    location because the site enforces region restrictions."""

    def __init__(self, msg, countries=None, **kwargs):
        # Geo-blocks are site-side policy, never a yt-dlp bug.
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries
1140
1141
class UserNotLive(ExtractorError):
    """Raised when the requested channel/user has no ongoing live stream."""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)
1148
1149
class DownloadError(YoutubeDLError):
    """Raised by FileDownloader objects that are not configured to continue
    on errors; carries the user-facing error message."""

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
1162
1163
class EntryNotInPlaylist(YoutubeDLError):
    """Raised by YoutubeDL when a requested entry cannot be found in the
    playlist's info_dict."""
    msg = 'Entry not found in info'
1171
1172
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Bug fix: the f-string previously contained the literal text
            # '(unknown)' instead of interpolating `filename`, so the
            # offending file name never appeared in the error message
            # (compare UnavailableVideoError, which appends f': {err}').
            self.msg += f': {filename}'
        super().__init__(self.msg)
1185
1186
class PostProcessingError(YoutubeDLError):
    """Raised from a PostProcessor's .run() method to signal that the
    post-processing task failed."""
1193
1194
class DownloadCancelled(YoutubeDLError):
    """Base class for exceptions that interrupt the download queue."""
    msg = 'The download was cancelled'
1198
1199
class ExistingVideoReached(DownloadCancelled):
    """Raised when --break-on-existing hits a video already in the archive."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1203
1204
class RejectedVideoReached(DownloadCancelled):
    """Raised when --break-match-filter hits a video that fails the filter."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'
1208
1209
class MaxDownloadsReached(DownloadCancelled):
    """Raised once the --max-downloads limit has been hit."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1213
1214
class ReExtractInfo(YoutubeDLError):
    """Signals that the video's info must be extracted again."""

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected
1221
1222
class ThrottledDownload(ReExtractInfo):
    """Raised when the measured speed falls below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)
1229
1230
class UnavailableVideoError(YoutubeDLError):
    """Raised when the video is requested in a format that the site does not
    offer for it."""
    msg = 'Unable to download video'

    def __init__(self, err=None):
        # Append the underlying error detail, when one was given.
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)
1243
1244
class ContentTooShortError(YoutubeDLError):
    """Raised when the downloaded payload is smaller than what the server
    announced, which usually means the connection was interrupted."""

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both counts are in bytes.
        self.downloaded = downloaded
        self.expected = expected
1258
1259
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended file attributes fails; classifies the
    failure into a machine-readable `reason`."""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Map the errno/message onto the coarse reason callers switch on.
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1274
1275
class XAttrUnavailableError(YoutubeDLError):
    """Raised when no usable xattr implementation is available."""
1278
1279
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate an http.client connection, honouring the handler's
    'source_address' option by binding outgoing sockets to that address
    (and filtering remote addresses to the matching IP family)."""
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Pick the address family matching the source address: the
            # presence of '.' is used to distinguish IPv4 from IPv6.
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate address in turn, keeping the last error.
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1325
1326
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        # `params` is the YoutubeDL option dict; it is kept by reference and
        # consulted per request (e.g. 'http_headers'), not copied.
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Swap in a SOCKS-capable connection class when the request carries
        # the internal 'Ytdl-socks-proxy' marker header.
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Try raw-deflate first; fall back to zlib-wrapped streams.
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        # `brotli` is an optional dependency; callers guard on its presence.
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
        try:
            return gz.read()
        except OSError as original_oserror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            # Retry with progressively shorter payloads before giving up.
            for i in range(1, 1024):
                try:
                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
                    return gz.read()
                except OSError:
                    continue
            else:
                raise original_oserror

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Youtubedl-no-compression' in req.headers:  # deprecated
            req.headers.pop('Youtubedl-no-compression', None)
            req.add_header('Accept-encoding', 'identity')

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
1454
1455
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from `base_class` whose sockets are
    tunnelled through the SOCKS proxy described by the `socks_proxy` URL."""
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    parsed = urllib.parse.urlparse(socks_proxy)
    scheme = parsed.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): any other scheme leaves `socks_type` unbound and raises
    # NameError below; callers appear to pre-validate the scheme -- confirm.

    def unquote_if_non_empty(value):
        return urllib.parse.unquote_plus(value) if value else value

    proxy_args = (
        socks_type,
        parsed.hostname, parsed.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(parsed.username),
        unquote_if_non_empty(parsed.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            # Replace the plain socket with a SOCKS-aware one.
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # Wrap in TLS afterwards when the base class is HTTPS.
            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1497
1498
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler supporting SOCKS proxies, custom connection classes and
    yt-dlp's source-address handling (via _create_http_connection)."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        # Forward the handler's SSL context / hostname checking to the
        # connection, when the base handler was built with them.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Internal marker header set upstream to request SOCKS tunnelling.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            # Old/misconfigured servers may need --legacy-server-connect;
            # surface an actionable message instead of the raw SSL error.
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
1527
1528
def is_path_like(f):
    """True for anything usable as a filesystem path (str/bytes/PathLike)."""
    return isinstance(f, (str, bytes, os.PathLike))
1531
1532
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """HTTPCookieProcessor that also applies cookie handling to HTTPS traffic."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1542
1543
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler fixes and improves the logic to better align with RFC7261
    and what browsers tend to do [2][3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://datatracker.ietf.org/doc/html/rfc7231
    3. https://github.com/python/cpython/issues/91306
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up Request for a 3xx response, or raise HTTPError
        for redirect codes we refuse to follow automatically."""
        if code not in (301, 302, 303, 307, 308):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

        new_method = req.get_method()
        new_data = req.data
        remove_headers = []
        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and req.get_method() != 'HEAD':
            new_method = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        elif code in (301, 302) and req.get_method() == 'POST':
            new_method = 'GET'

        # only remove payload if method changed (e.g. POST to GET)
        if new_method != req.get_method():
            new_data = None
            # Bug fix: these entries must be lower-case to match the
            # `k.lower()` comparison below. The previous title-cased values
            # never matched, so stale Content-Length/Content-Type headers
            # were carried over to the bodiless GET request.
            remove_headers.extend(['content-length', 'content-type'])

        new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}

        return urllib.request.Request(
            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=new_method, data=new_data)
1587
1588
def extract_timezone(date_str):
    """Split a trailing timezone designator off `date_str`.

    Returns (timezone, date_str) where `timezone` is a datetime.timedelta and
    `date_str` has the recognized designator removed. NOTE(review): when an
    alphabetic zone is present but unknown to TIMEZONE_NAMES, `timezone` is
    returned as None -- callers must tolerate that; confirm intent.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # No numeric offset: try a trailing alphabetic zone name (e.g. 'EST')
        # looked up in the TIMEZONE_NAMES table.
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
            timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # A bare 'Z' (no sign group) means UTC.
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1617
1618
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # strptime cannot digest fractional seconds here; drop them entirely.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    # Derive the UTC offset from the string unless one was supplied.
    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        dt = datetime.datetime.strptime(
            date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
        return calendar.timegm(dt.timetuple())
1634
1635
def date_formats(day_first=True):
    """Candidate strptime formats, day-first or month-first preference."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1638
1639
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    # NOTE: the loop deliberately has no break -- when several formats
    # match, the last matching one wins.
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822 style parsing (e.g. 'Thu 01 Jan 1970 00:00:00').
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return str(upload_date)
1662
1663
def unified_timestamp(date_str, day_first=True):
    """Parse a free-form date string into a UNIX timestamp, or None."""
    if date_str is None:
        return None

    # Collapse separators and drop weekday names, which the strptime
    # format candidates do not include.
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # Record a PM marker (applied as +12h) before it is stripped below.
    # NOTE(review): '12:xx PM' inputs also get +12h here -- confirm against
    # the formats callers feed in.
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fall back to RFC 2822 parsing; reapply PM shift and timezone manually.
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1695
1696
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to `default_ext`."""
    if url is None or '.' not in url:
        return default_ext
    # Strip the query string, then take whatever follows the final dot.
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Handle URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1708
1709
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Derive a subtitle file name by swapping in '<lang>.<format>' as extension."""
    return replace_extension(filename, f'{sub_lang}.{sub_format}', expected_real_ext)
1712
1713
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        # Defer rounding until the offset unit (if any) is known.
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # Recursively resolve the base date, then apply the signed offset.
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # timedelta cannot express months/years; use calendar arithmetic.
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # 'auto' rounds to the unit used in the offset expression.
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1754
1755
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1766
1767
def datetime_add_months(dt, months):
    """Shift `dt` by `months` (possibly negative), clamping the day to the
    target month's length (e.g. Jan 31 + 1 month -> Feb 28/29)."""
    total_months = dt.month - 1 + months
    year = dt.year + total_months // 12
    month = total_months % 12 + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
1775
1776
def datetime_round(dt, precision='day'):
    """Round `dt` (treated as UTC) to the nearest `precision` unit.

    'microsecond' is a no-op; otherwise rounds half-up on the UNIX
    timestamp. Raises KeyError for an unknown precision.
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    timestamp = calendar.timegm(dt.timetuple())
    rounded = ((timestamp + seconds_per_unit / 2) // seconds_per_unit) * seconds_per_unit
    return datetime.datetime.utcfromtimestamp(rounded)
1793
1794
def hyphenate_date(date_str):
    """Convert a 'YYYYMMDD' string to 'YYYY-MM-DD'; other strings pass through."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
1803
1804
class DateRange:
    """A closed interval of calendar dates."""

    def __init__(self, start=None, end=None):
        """start and end are strings in any format accepted by date_from_str;
        an omitted bound extends to the minimum/maximum representable date."""
        self.start = date_from_str(start, strict=True) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end, strict=True) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Return a range containing only the given day."""
        return cls(day, day)

    def __contains__(self, date):
        """True if `date` (a date object or parseable string) lies within the range."""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.start.isoformat()!r}, {self.end.isoformat()!r})'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1838
1839
@functools.cache
def system_identifier():
    """One-line description of the interpreter, machine, OS and OpenSSL
    version, used verbatim in debug output/bug reports (computed once)."""
    python_implementation = platform.python_implementation()
    if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
    libc_ver = []
    with contextlib.suppress(OSError):  # We may not have access to the executable
        libc_ver = platform.libc_ver()

    return 'Python %s (%s %s %s) - %s (%s%s)' % (
        platform.python_version(),
        python_implementation,
        platform.machine(),
        platform.architecture()[0],
        platform.platform(),
        ssl.OPENSSL_VERSION,
        format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
    )
1858
1859
@functools.cache
def get_windows_version():
    """Return the Windows version tuple, or () when not running on Windows."""
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1867
1868
def write_string(s, out=None, encoding=None):
    """Write the text `s` to stream `out` (default sys.stderr), encoding
    manually when the stream is binary or exposes an underlying buffer."""
    assert isinstance(s, str)
    out = out or sys.stderr
    # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
    if not out:
        return

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # NOTE(review): inserts a space before every line-break run on
        # Windows terminals; the reason is not evident from this file.
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: encode ourselves with the preferred encoding.
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a binary buffer: write bytes to the buffer.
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1888
1889
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Emit a deprecation message exactly once per distinct *msg* when in CLI mode;
    otherwise raise a standard DeprecationWarning via the warnings machinery."""
    from .. import _IN_CLI
    if not _IN_CLI:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
        return
    if msg in deprecation_warning._cache:
        return
    deprecation_warning._cache.add(msg)
    if printer:
        return printer(f'{msg}{bug_reports_message()}', **kwargs)
    return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)


# Set of messages already shown, so CLI users see each deprecation only once
deprecation_warning._cache = set()
1905
1906
def bytes_to_intlist(bs):
    """Convert a bytes-like object (or a legacy str) into a list of int values."""
    if not bs:
        return []
    # bytes/bytearray already iterate as ints; a str needs ord()
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
1914
1915
def intlist_to_bytes(xs):
    """Pack a list of integers (0-255) back into a bytes object."""
    if not xs:
        return b''
    return struct.pack('%dB' % len(xs), *xs)
1920
1921
class LockingUnsupportedError(OSError):
    """Raised when no file-locking primitive is available on this platform."""

    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1927
1928
# Cross-platform file locking.
# Defines _lock_file(f, exclusive, block) and _unlock_file(f) using the native
# primitive for the platform: LockFileEx/UnlockFileEx on Windows, flock/lockf
# elsewhere, with LockingUnsupportedError fallbacks when neither is available.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    # Mirrors the Win32 OVERLAPPED structure, required by LockFileEx/UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.WinDLL('kernel32')
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the largest possible byte range so the whole file is covered
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the whole of file *f* (exclusively or shared), optionally non-blocking."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep a reference on the file object so the struct outlives the lock
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        """Release the lock previously taken by _lock_file on *f*."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock file *f* via flock(), falling back to lockf() where flock is missing."""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # Non-blocking lock contention must propagate unchanged
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock on *f*, trying flock(), lockf(), then flock() with LOCK_NB."""
            with contextlib.suppress(OSError):
                return fcntl.flock(f, fcntl.LOCK_UN)
            with contextlib.suppress(OSError):
                return fcntl.lockf(f, fcntl.LOCK_UN)  # AOSP does not have flock()
            return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB)  # virtiofs needs LOCK_NB on unlocking

    except ImportError:

        def _lock_file(f, exclusive, block):
            """Stub used when no locking primitive exists; always raises."""
            raise LockingUnsupportedError()

        def _unlock_file(f):
            """Stub used when no locking primitive exists; always raises."""
            raise LockingUnsupportedError()
2015
2016
class locked_file:
    """A file wrapper that holds an OS-level lock while the file is open.

    Only plain 'r', 'rb', 'a', 'ab', 'w' and 'wb' modes are supported.
    Reading takes a shared lock, any writing mode an exclusive one.
    Usable as a context manager; unknown attributes are proxied to the
    underlying file object.
    """

    locked = False  # True while the OS lock is held

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        # Open via os.open so the exact flag set can be controlled; truncation
        # is deliberately deferred until after the lock is acquired
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        """Acquire the lock (closing the file on failure), then truncate for 'w' modes."""
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        """Release the lock if held; always clears the locked flag."""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        """Unlock and close the file, even if unlocking raises."""
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the wrapper can be used without the `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate everything else (read, write, seek, ...) to the real file
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2080
2081
@functools.cache
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to utf-8 when undetectable."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
2086
2087
def shell_quote(args):
    """Join *args* into a single shell-safe command-line string."""
    encoding = get_filesystem_encoding()

    def to_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(compat_shlex_quote(to_text(a)) for a in args)
2097
2098
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{payload}'
2107
2108
def unsmuggle_url(smug_url, default=None):
    """Extract data hidden in a URL by smuggle_url(); returns (url, data)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = urllib.parse.parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
2116
2117
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format a number with decimal suffixes (k, M, G, ...; Ki, Mi, ... for factor 1024)."""
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    SUFFIXES = 'kMGTPEZY'
    exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(SUFFIXES))
    suffix = ['', *SUFFIXES][exponent]
    if factor == 1024:
        # Binary prefixes: 'k' becomes 'Ki', others get an 'i' appended
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    return fmt % (num / (factor ** exponent), suffix)
2130
2131
def format_bytes(bytes):
    """Render a byte count using binary suffixes, or 'N/A' when unknown."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
2134
2135
def lookup_unit_table(unit_table, s, strict=False):
    """Parse '<number><unit>' at the start of *s* and return the scaled, rounded int.

    In non-strict mode a comma is accepted as decimal separator and only the
    prefix of *s* needs to match; strict mode requires a full match.
    """
    num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
    units_re = '|'.join(map(re.escape, unit_table))
    matcher = re.fullmatch if strict else re.match
    m = matcher(rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None
    value = float(m.group('num').replace(',', '.'))
    return round(value * unit_table[m.group('unit')])
2147
2148
def parse_bytes(s):
    """Parse a string indicating a byte quantity into an integer"""
    table = {unit: 1024 ** exponent for exponent, unit in enumerate(['', *'KMGTPEZY'])}
    return lookup_unit_table(table, s.upper(), strict=True)
2154
2155
def parse_filesize(s):
    """Parse a human-readable file size ('5 MiB', '1.2GB', ...) into bytes, or None."""
    if s is None:
        return None

    # Build the unit table programmatically. For each SI prefix we accept the
    # binary ('KiB') and decimal ('KB') forms plus the common informal
    # lower-case variants ('kB' -> binary, 'kb' -> decimal), which are of
    # course incorrect and unofficial but seen in the wild, and the spelled-out
    # names. NB: insertion order is significant because lookup_unit_table
    # builds a regex alternation from the keys.
    unit_table = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    prefixes = [
        ('K', 'kilobytes', 'kibibytes'),
        ('M', 'megabytes', 'mebibytes'),
        ('G', 'gigabytes', 'gibibytes'),
        ('T', 'terabytes', 'tebibytes'),
        ('P', 'petabytes', 'pebibytes'),
        ('E', 'exabytes', 'exbibytes'),
        ('Z', 'zettabytes', 'zebibytes'),
        ('Y', 'yottabytes', 'yobibytes'),
    ]
    for exponent, (letter, decimal_name, binary_name) in enumerate(prefixes, start=1):
        decimal, binary = 1000 ** exponent, 1024 ** exponent
        lower = letter.lower()
        unit_table.update({
            f'{letter}iB': binary,
            f'{letter}B': decimal,
            f'{lower}B': binary,
            f'{letter}b': decimal,
            f'{lower}b': decimal,
            decimal_name: decimal,
            binary_name: binary,
        })

    return lookup_unit_table(unit_table, s)
2225
2226
def parse_count(s):
    """Parse a human-readable count ('1.2M', '15k', '3,456 views') into an int, or None."""
    if s is None:
        return None

    # Drop a leading non-numeric word (e.g. 'Views 1,234')
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    result = lookup_unit_table(multipliers, s)
    if result is not None:
        return result

    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(mobj.group(1)) if mobj else None
2254
2255
def parse_resolution(s, *, lenient=False):
    """Extract width/height from a resolution-like string ('1920x1080', '720p', '4k').

    @param lenient  also match WxH even when embedded in alphanumeric text
    @returns        dict with 'width'/'height' keys as applicable, possibly empty
    """
    if s is None:
        return {}

    if lenient:
        pattern = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    else:
        pattern = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    mobj = re.search(pattern, s)
    if mobj:
        return {'width': int(mobj.group('w')), 'height': int(mobj.group('h'))}

    # '720p' / '1080i' style
    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    # '4k' / '8k' marketing shorthand (4k == 2160 lines)
    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}
2279
2280
def parse_bitrate(s):
    """Extract an 'NNN kbps' bitrate from *s* as an int, or None."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
2287
2288
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in names:
        return None
    return names.index(name) + 1
2298
2299
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    if abbrev not in abbreviations:
        return None
    return abbreviations.index(abbrev) + 1
2308
2309
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in *xml_str* with '&amp;', leaving existing entities intact."""
    # Negative lookahead skips named entities and numeric character references
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
2316
2317
def setproctitle(title):
    """Set the process name shown by ps/top via prctl(PR_SET_NAME) - glibc only.

    Best effort: silently returns when ctypes is unavailable, libc.so.6
    cannot be loaded (non-glibc platforms), or libc lacks prctl().
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    # NB: the former `except TypeError` fallback guarded a Python 2.7
    # LoadLibrary quirk and is unreachable on Python 3, so it was dropped
    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
2343
2344
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present; None passes through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2347
2348
def remove_end(s, end):
    """Strip *end* from the end of *s* when present; None passes through."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
2351
2352
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes from *s*."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
2360
2361
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    if netloc.startswith('www.'):
        netloc = netloc[4:]
    return netloc or None
2368
2369
def url_basename(url):
    """Return the last path component of *url* ('' when there is none)."""
    path = urllib.parse.urlparse(url).path
    return path.rstrip('/').rpartition('/')[2]
2373
2374
def base_url(url):
    """Return the URL up to and including the last '/' before any query/fragment."""
    m = re.match(r'https?://[^?#]+/', url)
    return m.group()
2377
2378
def urljoin(base, path):
    """Join *path* onto *base*, returning None when either part is unusable.

    Absolute/protocol-relative paths are returned untouched; *base* must be
    an http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not path or not isinstance(path, str):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path  # already absolute (or protocol-relative)
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
2392
2393
class HEADRequest(urllib.request.Request):
    """A Request that issues an HTTP HEAD instead of GET."""

    def get_method(self):
        return 'HEAD'
2397
2398
class PUTRequest(urllib.request.Request):
    """A Request that issues an HTTP PUT instead of GET/POST."""

    def get_method(self):
        return 'PUT'
2402
2403
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* to int scaled by invscale/scale, returning *default* on failure.

    When *get_attr* is given, the named attribute of *v* is converted instead.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        result = int(v)
    except (ValueError, TypeError, OverflowError):
        return default
    return result * invscale // scale
2411
2412
def str_or_none(v, default=None):
    """Stringify *v*, returning *default* when it is None."""
    if v is None:
        return default
    return str(v)
2415
2416
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, str):
        # Tolerate thousands separators and '+' signs
        int_str = re.sub(r'[,\.\+]', '', int_str)
    try:
        return int(int_str)
    except (ValueError, TypeError, OverflowError):
        return None
2424
2425
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale, returning *default* on failure."""
    if v is None:
        return default
    try:
        result = float(v)
    except (ValueError, TypeError):
        return default
    return result * invscale / scale
2433
2434
def bool_or_none(v, default=None):
    """Return *v* only when it is a genuine bool, else *default*."""
    if isinstance(v, bool):
        return v
    return default
2437
2438
def strip_or_none(v, default=None):
    """Return v.strip() for strings, else *default*."""
    if isinstance(v, str):
        return v.strip()
    return default
2441
2442
def url_or_none(url):
    """Return the stripped URL when it uses a recognized scheme (http(s), rtmp-family,
    rtsp, mms, ftp(s)) or is protocol-relative; otherwise None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
2448
2449
def request_to_url(req):
    """Return the URL string of *req*, which may be a Request object or a plain URL."""
    return req.get_full_url() if isinstance(req, urllib.request.Request) else req
2455
2456
def strftime_or_none(timestamp, date_format, default=None):
    """Format *timestamp* (unix time or 'YYYYMMDD' string) with *date_format*,
    returning *default* on any failure."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            # Use an aware epoch plus a timedelta: naive fromtimestamp()
            # breaks timestamp() on Windows and fails for negative values
            # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185,
            #      https://github.com/python/cpython/issues/94414,
            #      https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642
            dt = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc)
                  + datetime.timedelta(seconds=timestamp))
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            dt = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            dt = None
        # Emulate the POSIX-only %s directive so it also works on Windows
        date_format = re.sub(
            r'(?<!%)(%%)*%s', rf'\g<1>{int(dt.timestamp())}', date_format)
        return dt.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2474
2475
def parse_duration(s):
    """Parse a duration ('1:23:45', '2h30m', 'PT1M30S', '1.5 hours', ...) into
    seconds as a float, or None when *s* is not recognized."""
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    # 1) Clock format: [[DD:]HH:]MM:SS[.ms]
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        parts = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2) Unit-suffixed / ISO-8601-like format ('1d 2h', 'PT5M', '90s', ...)
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            parts = m.groups()
        else:
            # 3) Fractional '1.5 hours' / '90 min' forms
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not m:
                return None
            parts = (None, *m.groups(), None, None)

    days, hours, mins, secs, ms = parts
    if ms:
        ms = ms.replace(':', '.')
    return sum(float(value or 0) * multiplier for value, multiplier in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2530
2531
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension ('a.mp4' -> 'a.<ext>.mp4').

    When *expected_real_ext* is given but does not match the actual
    extension, '(unknown).<ext>' is returned instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'(unknown).{ext}'
    return f'{name}.{ext}{real_ext}'
2538
2539
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of *filename* with *ext*.

    When *expected_real_ext* is given but does not match the current
    extension, *ext* is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return f'{name}.{ext}'
2545
2546
def check_executable(exe, args=()):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a sequence of arguments for a short output (like -version)

    NB: the default was previously a mutable list ([]); a tuple avoids the
    shared-mutable-default pitfall and remains backward-compatible since
    callers can still pass lists.
    """
    try:
        Popen.run([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
2555
2556
def _get_exe_version_output(exe, args):
    """Run *exe* with *args* and return its combined stdout/stderr text.

    Returns None when the process exits non-zero, and False when the
    binary cannot be executed at all.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, returncode = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return None if returncode else stdout
2569
2570
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's output using *version_re*,
    returning *unrecognized* when no version can be found."""
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
2580
2581
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized=('present', 'broken')):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    unrecognized = variadic(unrecognized)
    assert len(unrecognized) in (1, 2)
    output = _get_exe_version_output(exe, args)
    if output is None:
        # Ran but exited non-zero -> "broken" (or the single fallback value)
        return unrecognized[-1]
    if not output:
        return output  # False: not installed at all
    return detect_exe_version(output, version_re, unrecognized[0])
2592
2593
def frange(start=0, stop=None, step=1):
    """Float range"""
    if stop is None:
        start, stop = 0, start
    direction = (step > 0) - (step < 0)  # 0 when step == 0 -> empty range
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2602
2603
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Distinguishes exhaustion of this list from unrelated IndexErrors
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache may be shared between a list and its __reversed__/__copy__ views
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        # Pull everything remaining from the iterable into the cache
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map a forward index to the equivalent index from the end (~x == -x - 1)
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                # Translate the slice so it can be applied to the forward cache
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Only pull as many items as the requested index/slice needs
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            # Probing the first (logical) element avoids full exhaustion
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # Shares the iterable and cache with the original view
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2691
2692
class PagedList:
    """Base class for lazily-fetched, page-oriented sequences of entries."""

    class IndexError(IndexError):
        """Raised when an index lies past the end of the list."""

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')  # unknown until a fetch fails
        self._use_cache = use_cache
        self._cache = {}

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getpage(self, pagenum):
        """Return (and optionally cache) the list of entries on page *pagenum*."""
        results = self._cache.get(pagenum)
        if results is not None:
            return results
        results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        """Materialize the entries in [start, end) as a plain list."""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
2731
2732
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        """Yield entries in [start, end), fetching successive pages as needed."""
        for pagenum in itertools.count(start // self._pagesize):
            # Absolute indices of the first entry on this page and the next
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested window within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember the last page that worked so later lookups short-circuit
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2772
2773
class InAdvancePagedList(PagedList):
    """PagedList whose total number of pages is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        if end is None:
            last_page = self._pagecount
            remaining = None
        else:
            last_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start
        # Entries on the first page that lie before *start*
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, last_page):
            page = self.getpage(pagenum)
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) < remaining:
                    remaining -= len(page)
                else:
                    # Final (possibly partial) page of the requested window
                    yield from page[:remaining]
                    break
            yield from page
2798
2799
class PlaylistEntries:
    """Iterable view over a playlist's entries, honoring the user's
    playlist_items / playliststart / playlistend selection."""

    MissingEntry = object()  # sentinel for gaps in a partially-extracted playlist
    is_exhausted = False  # True once the full length of the playlist is known

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = requested_entries is not None
        if self.is_incomplete:
            assert self.is_exhausted
            # Re-create the playlist at full length, filling unrequested
            # positions with the MissingEntry sentinel (indices are 1-based)
            self._entries = [self.MissingEntry] * max(requested_entries or [0])
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches one segment of --playlist-items: START[:END[:STEP]] or a single index
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        """Yield an int or slice for each comma-separated segment of *string*."""
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            # 'inf' parses to float('inf') via float_or_none
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        """Yield (index, entry) pairs selected by the user's playlist options."""
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # The item may have just been added to archive. Don't break due to it
                    if not self.ydl.params.get('lazy_playlist'):
                        # TODO: Add auto-generated fields
                        self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        """Return the playlist's total entry count when it can be determined."""
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function fetching entry i, raising self.IndexError past the end
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Route lazy extraction through the ydl error-handling wrapper
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Accepts 1-based ints and slices; yields (1-based index, entry) pairs
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2934
2935
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX (32-bit) escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(r'\\U[0-9a-fA-F]{8}', lambda m: decode(m.group(0))[0], s)
2942
2943
def lowercase_escape(s):
    """Decode literal \\uXXXX (16-bit) escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(r'\\u[0-9a-fA-F]{4}', lambda m: decode(m.group(0))[0], s)
2950
2951
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Reserved/sub-delim characters stay untouched; only unsafe bytes get quoted
    safe = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe)
2955
2956
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parsed = urllib.parse.urlparse(url)
    escaped_fields = {
        field: escape_rfc3986(getattr(parsed, field))
        for field in ('path', 'params', 'query', 'fragment')
    }
    # The host is IDNA-encoded instead of percent-escaped
    return parsed._replace(
        netloc=parsed.netloc.encode('idna').decode('ascii'),
        **escaped_fields,
    ).geturl()
2967
2968
def parse_qs(url, **kwargs):
    """Parse the query string of *url* into a dict of lists of values."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query, **kwargs)
2971
2972
def read_batch_urls(batch_fd):
    """Read a batch file object and return the list of URLs in it.

    Lines may be str or bytes (decoded as UTF-8 with replacement); UTF-8
    BOMs are stripped, blank lines and lines starting with '#', ';' or ']'
    are skipped, and a trailing ' #comment' is removed from each URL.
    The file object is closed before returning.
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # NB: maxsplit must be passed by keyword — positional form is
        # deprecated since Python 3.13
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2990
2991
def urlencode_postdata(*args, **kargs):
    """URL-encode the given mapping/sequence and return ASCII bytes suitable
    for use as an HTTP POST body."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2994
2995
def update_url(url, *, query_update=None, **kwargs):
    """Replace URL components specified by kwargs
    @param url          str or parsed URL tuple
    @param query_update dict of query parameters to merge into the URL
    @returns str
    """
    if isinstance(url, str):
        # Fast path: nothing to change
        if not kwargs and not query_update:
            return url
        url = urllib.parse.urlparse(url)
    if query_update:
        assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
        merged = {**urllib.parse.parse_qs(url.query), **query_update}
        kwargs['query'] = urllib.parse.urlencode(merged, True)
    return urllib.parse.urlunparse(url._replace(**kwargs))
3014
3015
def update_url_query(url, query):
    """Merge dict *query* into the query string of *url*; returns the new URL."""
    return update_url(url, query_update=query)
3018
3019
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone a urllib Request, optionally overriding its URL, body, headers
    and/or query string.

    The HTTP verb is preserved by picking the matching Request subclass
    (HEADRequest/PUTRequest are declared elsewhere in this module).
    NB: falsy *data* (e.g. b'') keeps the original request body.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # Carry the timeout over when the original request had one
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3038
3039
3040 def _multipart_encode_impl(data, boundary):
3041 content_type = 'multipart/form-data; boundary=%s' % boundary
3042
3043 out = b''
3044 for k, v in data.items():
3045 out += b'--' + boundary.encode('ascii') + b'\r\n'
3046 if isinstance(k, str):
3047 k = k.encode()
3048 if isinstance(v, str):
3049 v = v.encode()
3050 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3051 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3052 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3053 if boundary.encode('ascii') in content:
3054 raise ValueError('Boundary overlaps with data')
3055 out += content
3056
3057 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3058
3059 return out, content_type
3060
3061
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    boundary_was_given = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            # The random boundary collided with the payload — retry with a
            # fresh one; an explicitly supplied boundary is never replaced
            if boundary_was_given:
                raise
            boundary = None
3090
3091
def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
    """Return True when *x* is an instance of *allowed_types* but not of
    *blocked_types*; by default str, bytes and mappings count as scalars."""
    if blocked_types is NO_DEFAULT:
        blocked_types = (str, bytes, collections.abc.Mapping)
    if not isinstance(x, allowed_types):
        return False
    return not isinstance(x, blocked_types)
3096
3097
def variadic(x, allowed_types=NO_DEFAULT):
    """Ensure *x* is tuple-like: return it unchanged when it is an iterable
    container, else wrap it in a 1-tuple. The types given in *allowed_types*
    are treated as scalars (they are passed to is_iterable_like as
    blocked_types)."""
    if not isinstance(allowed_types, (tuple, type)):
        # Passing an arbitrary iterable of types is deprecated
        deprecation_warning('allowed_types should be a tuple or a type')
        allowed_types = tuple(allowed_types)
    return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
3103
3104
def try_call(*funcs, expected_type=None, args=(), kwargs=None):
    """Call each callable in *funcs* with *args*/*kwargs* and return the first
    result that matches *expected_type* (any result if it is None).

    Common lookup/conversion errors (AttributeError, KeyError, TypeError,
    IndexError, ValueError, ZeroDivisionError) are swallowed; returns None
    when every call fails or no result matches.

    NB: defaults were changed from mutable [] / {} to immutable equivalents
    (same observable behavior; avoids the shared-mutable-default pitfall).
    """
    for f in funcs:
        try:
            val = f(*args, **(kwargs or {}))
        except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
            pass
        else:
            if expected_type is None or isinstance(val, expected_type):
                return val
3114
3115
def try_get(src, getter, expected_type=None):
    """Apply one or more getter callables to *src* and return the first result
    matching *expected_type*; common lookup errors are swallowed (see try_call)."""
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3118
3119
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of *dct* keeping only items for which cndn(key, value)
    is truthy; by default, items whose value is None are dropped."""
    return {key: value for key, value in dct.items() if cndn(key, value)}
3122
3123
def merge_dicts(*dicts):
    """Merge dicts left-to-right. A key is taken from a later dict only when
    it is not yet present (and the value is not None), or when the value seen
    so far is an empty string and the new value is a (non-empty) string."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            is_new = value is not None and key not in merged
            fills_empty_str = isinstance(value, str) and merged.get(key) == ''
            if is_new or fills_empty_str:
                merged[key] = value
    return merged
3132
3133
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* as str, decoding bytes-like input with *encoding*.
    NB: the default encoding is captured once, at import time."""
    return string if isinstance(string, str) else str(string, encoding, errors)
3136
3137
# US (MPAA) movie ratings mapped to the age limit yt-dlp reports for them
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines mapped to the age limit yt-dlp reports for them
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3155
3156
def parse_age_limit(s):
    """Parse an age limit from an int, an 'NN+' string, a US movie rating or
    a TV parental guideline; returns an int or None when unrecognised."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    mobj = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if mobj:
        return TV_PARENTAL_GUIDELINES['TV-' + mobj.group(1)]
    return None
3173
3174
def strip_jsonp(code):
    """Strip a JSONP wrapper — 'func(...data...);', optionally prefixed with
    'window.' and followed by '//' comments — returning just the data."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
3183
3184
def js_to_json(code, vars={}, *, strict=False):
    """Convert JavaScript object notation in *code* into parseable JSON text.

    @param vars    dict of identifier -> replacement value, substituted for
                   bare identifiers found in the input
    @param strict  when true, raise ValueError on unknown identifiers instead
                   of quoting them as strings
    """
    # vars is a dict of var, val pairs to substitute
    STRING_QUOTES = '\'"`'
    STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def process_escape(match):
        JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
        escape = match.group(1) or match.group(2)

        # Escapes JSON understands are kept; \xNN becomes \u00NN; an escaped
        # newline (JS line continuation) is dropped; anything else is unescaped
        return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
                else R'\u00' if escape == 'x'
                else '' if escape == '\n'
                else escape)

    def template_substitute(match):
        # Recursively evaluate a ${...} template-string interpolation
        evaluated = js_to_json(match.group(1), vars, strict=strict)
        if evaluated[0] == '"':
            return json.loads(evaluated)
        return evaluated

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ''

        if v[0] in STRING_QUOTES:
            # Backtick strings get ${...} interpolation before re-quoting
            v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
            escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
            return f'"{escaped}"'

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # A trailing ':' means the number is used as an object key
                return f'"{i}":' if v.endswith(':') else str(i)

        if v in vars:
            try:
                if not strict:
                    json.loads(vars[v])
            except json.JSONDecodeError:
                # The substitution is not itself JSON — emit it as a string
                return json.dumps(vars[v])
            else:
                return vars[v]

        if not strict:
            return f'"{v}"'

        raise ValueError(f'Unknown value: {v}')

    def create_map(mobj):
        # new Map([[k, v], ...]) -> plain JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        # Best-effort rewrites of common JS constructor/call patterns
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
        code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
        code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
        code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)

    return re.sub(rf'''(?sx)
        {STRING_RE}|
        {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
        [0-9]+(?={SKIP_RE}:)|
        !+
        ''', fix_kv, code)
3263
3264
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
3273
3274
# Stages at which postprocessors may run, in execution order
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output filename templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Output template types mapped to the default filename suffix used for them
# (None = no fixed suffix)
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# NB: this is a template — {0} is the key pattern, {1} the conversion types
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])? # unused in python
        {1} # conversion type
    )
    '''


# Valid conversion-type characters for STR_FORMAT_RE_TMPL's {1} slot
STR_FORMAT_TYPES = 'diouxXeEfFgGcrsa'
3314
3315
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
3324
3325
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
3328
3329
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is older than *limit*. Missing or
    unparseable versions are judged per *assume_new* (new -> not outdated)."""
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
3337
3338
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # NOTE(review): imported lazily rather than at module level — presumably
    # to avoid a circular import; confirm before moving to the top of the file
    from ..update import is_non_updateable

    return not is_non_updateable()
3345
3346
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(compat_shlex_quote, args))
3350
3351
def error_to_str(err):
    """Format an exception as 'ClassName: message'."""
    cls_name = type(err).__name__
    return f'{cls_name}: {err}'
3354
3355
def mimetype2ext(mt, default=NO_DEFAULT):
    """Map a MIME type (optionally with parameters, e.g. 'video/mp4; codecs=…')
    to a file extension.

    Falls back to *default* when given; otherwise returns the subtype itself
    with '+' converted to '.', or None for non-string input.
    """
    if not isinstance(mt, str):
        if default is not NO_DEFAULT:
            return default
        return None

    MAP = {
        # video
        '3gpp': '3gp',
        'mp2t': 'ts',
        'mp4': 'mp4',
        'mpeg': 'mpeg',
        'mpegurl': 'm3u8',
        'quicktime': 'mov',
        'webm': 'webm',
        'vp9': 'vp9',
        'x-flv': 'flv',
        'x-m4v': 'm4v',
        'x-matroska': 'mkv',
        'x-mng': 'mng',
        'x-mp4-fragmented': 'mp4',
        'x-ms-asf': 'asf',
        'x-ms-wmv': 'wmv',
        'x-msvideo': 'avi',

        # application (streaming playlists)
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
        'x-mpegurl': 'm3u8',

        # audio
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Using .mp3 as it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/webm': 'webm',
        'audio/x-matroska': 'mka',
        'audio/x-mpegurl': 'm3u',
        'midi': 'mid',
        'ogg': 'ogg',
        'wav': 'wav',
        'wave': 'wav',
        'x-aac': 'aac',
        'x-flac': 'flac',
        'x-m4a': 'm4a',
        'x-realaudio': 'ra',
        'x-wav': 'wav',

        # image
        'avif': 'avif',
        'bmp': 'bmp',
        'gif': 'gif',
        'jpeg': 'jpg',
        'png': 'png',
        'svg+xml': 'svg',
        'tiff': 'tif',
        'vnd.wap.wbmp': 'wbmp',
        'webp': 'webp',
        'x-icon': 'ico',
        'x-jng': 'jng',
        'x-ms-bmp': 'bmp',

        # caption
        'filmstrip+json': 'fs',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-ms-sami': 'sami',

        # misc
        'gzip': 'gz',
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
    }

    # Drop MIME parameters, then isolate the subtype ('video/mp4' -> 'mp4')
    mimetype = mt.partition(';')[0].strip().lower()
    _, _, subtype = mimetype.rpartition('/')

    # Look up the full mimetype, then the bare subtype, then the last
    # '+'-separated component ('svg+xml' -> 'xml')
    ext = traversal.traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
    if ext:
        return ext
    elif default is not NO_DEFAULT:
        return default
    return subtype.replace('+', '.')
3444
3445
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare file extension or a URL/filename;
    returns None for empty input or an unknown type."""
    if not ext_or_url:
        return None
    # A bare extension is turned into a dummy filename for guess_type
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    return mimetypes.guess_type(filename)[0]
3452
3453
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string (e.g. 'avc1.64001f, mp4a.40.2') into
    vcodec/acodec/scodec and dynamic_range fields."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Strip leading zeroes from each dotted part before matching
        # (e.g. 'av01' -> 'av1')
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                continue
            vcodec = full_codec
            # Detect HDR variants from the codec parameters
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traversal.traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Exactly two unrecognised codecs: assume video + audio, in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3494
3495
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Pick a container extension able to hold the given video/audio codec
    and extension combinations, honoring *preferences* when possible; falls
    back to 'mkv' (or the last preference when mkv is not allowed)."""
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    # Multiple video or audio streams only fit in mkv
    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a', 'ac-4',  # fourcc (m3u8, mpd)
            'h264', 'aacl', 'ec-3',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Normalise to the first codec's fourcc with zeroes removed ('av01' -> 'av1')
    sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm', 'weba'},
    )
    # Codec matching failed — fall back to extension-family compatibility
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3534
3535
def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
    """Guess a file extension for a response handle: first from the
    Content-Disposition filename, then from the x-amz-meta-name header,
    finally from the Content-Type."""
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    meta_name = headers.get('x-amz-meta-name')
    if meta_name:
        ext = meta_name.rpartition('.')[2]
        if ext:
            return ext

    return mimetype2ext(headers.get('Content-Type'), default=default)
3554
3555
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI for *data* (bytes) with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3558
3559
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No viewer age limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3568
3569
# List of known byte-order-marks (BOM)
# NB: the UTF-32 entries must precede the UTF-16 ones, since the UTF-16 BOMs
# are prefixes of the UTF-32 ones
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
3578
3579
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    encoding = 'utf-8'
    remainder = first_bytes
    for bom, bom_encoding in BOMS:
        # Strip (possibly repeated) BOMs, remembering the encoding they imply
        while remainder.startswith(bom):
            encoding, remainder = bom_encoding, remainder[len(bom):]

    return re.match(r'^\s*<', remainder.decode(encoding, 'replace'))
3589
3590
def determine_protocol(info_dict):
    """Infer the download protocol for *info_dict*: an explicit 'protocol'
    field wins, then well-known URL scheme prefixes (rtmp/mms/rtsp), then the
    file extension (m3u8/f4m), falling back to the URL scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        # Live HLS is tagged 'm3u8'; VOD defaults to the native HLS downloader
        return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
    elif ext == 'f4m':
        return 'f4m'

    return urllib.parse.urlparse(url).scheme
3611
3612
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: ignore terminal escape sequences and the alignment tab
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # When hide_empty is set, drop columns whose data cells are all empty
    # (zip(*data) truncates to the data's width, so extra header cells survive
    # via zip_longest's fillvalue)
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a separator line made of the delimiter under the header
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Pad at the tab so the trailing part ends up right-aligned
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3643
3644
def _match_one(filter_part, dct, incomplete):
    """Evaluate a single filter expression (e.g. 'duration > 60', 'title *= x'
    or '!is_live') against dict *dct*.

    @param incomplete  bool, or a container of keys that may legitimately be
                       missing from dct; conditions on missing keys then pass
    @raises ValueError for an unparseable filter part
    """
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    if isinstance(incomplete, bool):
        is_incomplete = lambda _: incomplete
    else:
        is_incomplete = lambda k: k in incomplete

    operator_rex = re.compile(r'''(?x)
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        # NOTE(review): 'intval' is not a group of this regex; since one of
        # quotedstrval/strval always matches (.+?), the m['intval'] fallback is
        # unreachable — looks like a youtube-dl leftover; confirm before removal
        comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
        if m['quote']:
            # Unescape the quote character inside quoted values
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, (int, float)):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            return is_incomplete(m['key']) or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.fullmatch(filter_part.strip())
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if is_incomplete(m.group('key')) and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
3723
3724
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    # '&' separates conditions (all must hold); a literal '&' is escaped as '\&'
    return all(
        _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
        for filter_part in re.split(r'(?<!\\)&', filter_str))
3735
3736
def match_filter_func(filters, breaking_filters=None):
    """Compile --match-filter style strings into a match_filter callable.

    Returns None when there is nothing to filter. The returned callable takes
    (info_dict, incomplete) and returns None to accept, a skip-reason string
    to reject, or NO_DEFAULT to ask interactively (when '-' was among the
    filters). Matching *breaking_filters* raise RejectedVideoReached instead.
    """
    if not filters and not breaking_filters:
        return None
    # Breaking filters are themselves compiled with this function (recursively)
    breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
    filters = set(variadic(filters or []))

    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        ret = breaking_filters(info_dict, incomplete)
        if ret is not None:
            raise RejectedVideoReached(ret)

        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            return NO_DEFAULT if interactive and not incomplete else None
        else:
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3759
3760
class download_range_func:
    """Callable selecting sections of a video to download, by chapter-title
    regexes and/or explicit (start, end) time ranges."""

    def __init__(self, chapters, ranges):
        self.chapters, self.ranges = chapters, ranges

    def __call__(self, info_dict, ydl):
        # With nothing configured, yield a single empty section (whole video)
        if not self.ranges and not self.chapters:
            yield {}

        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for regex in self.chapters or []:
            for idx, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': idx}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        if not isinstance(other, download_range_func):
            return False
        return self.chapters == other.chapters and self.ranges == other.ranges

    def __repr__(self):
        return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3787
3788
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML time expression ('12.5', '12.5s' or 'H:MM:SS[.fff|:fff]')
    into seconds; returns None for empty or unrecognised input."""
    if not time_expr:
        return None

    mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes, seconds = mobj.groups()
        # A ':' before the fraction is treated like a decimal point
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
3800
3801
def srt_subtitles_timecode(seconds):
    """Format *seconds* as an SRT timecode (HH:MM:SS,mmm)."""
    hrs, mins, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msec)
3804
3805
def ass_subtitles_timecode(seconds):
    """Format *seconds* as an ASS timecode (H:MM:SS.cc — centiseconds)."""
    hrs, mins, secs, msec = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (hrs, mins, secs, msec / 10)
3809
3810
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Legacy TTML namespaces are rewritten to their modern equivalents before
    # parsing, so one xpath namespace map covers all inputs
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    # XMLParser target that converts one <p> element into SRT-style markup
    # (<b>/<i>/<u>/<font>), closing tags in reverse order of opening
    class TTMLPElementParser:
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already applied by an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
    # This will not trigger false positives since only UTF-8 text is being replaced
    dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; repeat while some parent style is declared
    # later in the document than a style referencing it
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style referenced by <body> or <div> becomes the default for all <p>
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3977
3978
def cli_option(params, command_option, param, separator=None):
    """Build a CLI argument list for *command_option* from ``params[param]``.

    Returns [] when the parameter is unset; otherwise either
    ``[command_option, value]`` or a single ``f'{command_option}{separator}{value}'``.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
3984
3985
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a CLI argument list for a boolean parameter.

    ``params[param]`` must be True, False or None (unset). When set, the
    matching *true_value*/*false_value* string is emitted, either as a
    separate argument or joined with *separator*.
    """
    value = params.get(param)
    assert value in (True, False, None)
    if value is None:
        return []
    text = true_value if value else false_value
    if separator is None:
        return [command_option, str(text)]
    return [f'{command_option}{separator}{text}']
3990
3991
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return ``[command_option]`` when ``params[param]`` equals *expected_value*, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3994
3995
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Resolve configuration arguments from *argdict* using the first matching key.

    Each entry of *keys* may be a single key or a tuple of keys whose argument
    lists are concatenated. Returns *default* when nothing matches.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_list in keys:
        found = [argdict.get(key.lower()) for key in variadic(key_list)]
        found = [args for args in found if args is not None]
        if found:
            # Flatten: each match is itself a list of arguments
            return [arg for args in found for arg in args]
    return default
4014
4015
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve per-executable configuration args, with '<main_key>+<exe>' scoping.

    When no explicit key suffixes are given (so the bare root key is present),
    also fall back to ``(main_key, exe)`` and ``'default'``; otherwise compat
    list handling is disabled.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in lookup_keys:
        use_compat = False
    else:
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
4027
4028
class ISO639Utils:
    """Conversion between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter) language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Keys are ISO 639-1, values are ISO 639-2/T. A few deprecated 639-1 codes
    # (iw, in, ji) are kept as aliases of their modern replacements.
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pe': 'per',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so tags like 'en-US'
        # resolve via their primary subtag. Returns None when unknown.
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear reverse lookup; the first (canonical) match wins.
        # Implicitly returns None when the code is unknown.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
4234
class ISO3166Utils:
    """Lookup of full country names from ISO 3166-1 alpha-2 country codes."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
        # Case-insensitive; returns None for unknown codes.
        return cls._country_map.get(code.upper())
4497
class GeoUtils:
    """Generation of plausible per-country IPv4 addresses (used for geo-bypass)."""

    # Major IPv4 address blocks per country, keyed by ISO 3166-1 alpha-2 code
    # (plus the non-ISO 'AP'/'EU' regional codes), values in CIDR notation.
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address string inside the given two-letter
        country code's block or an explicit CIDR block; None for unknown codes."""
        if len(code_or_block) == 2:
            # Two characters: treat as a country code and look up its block
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        # Lowest address of the block as a 32-bit big-endian integer
        addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
        # Highest address: set all host bits below the prefix length
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return str(socket.inet_ntoa(
            struct.pack('!L', random.randint(addr_min, addr_max))))
4756
4757
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler honoring a per-request 'Ytdl-request-proxy' header override."""

    def __init__(self, proxies=None):
        # Install fallback handlers first; the parent __init__ then overrides
        # them for any scheme actually present in *proxies*
        for scheme in ('http', 'https'):
            def default_open(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)
            setattr(self, f'{scheme}_open', default_open)
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            proxy = override
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        scheme = urllib.parse.urlparse(proxy).scheme.lower()
        if scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # yt-dlp's http/https handlers wrap the socket with SOCKS themselves
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        return urllib.request.ProxyHandler.proxy_open(self, req, proxy, type)
4781
4782
4783 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4784 # released into Public Domain
4785 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4786
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n <= 0:
        # Matches the historical behavior: zero (and negatives) encode as one NUL byte
        encoded = b'\000'
    else:
        # Minimal big-endian representation without leading zeros
        encoded = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    if blocksize > 0 and len(encoded) % blocksize:
        encoded = b'\000' * (blocksize - len(encoded) % blocksize) + encoded
    return encoded
4815
4816
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Big-endian interpretation; the empty string yields 0, exactly like the
    # original word-by-word accumulation loop
    return int.from_bytes(s, 'big')
4832
4833
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The message is interpreted little-endian: reverse the bytes, then parse
    # the hex digest as a big-endian integer
    message = int(binascii.hexlify(data[::-1]), 16)
    return f'{pow(message, exponent, modulus):x}'
4849
4850
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # EME-PKCS1-v1_5 requires the padding string PS to consist of NONZERO
    # octets -- a zero byte would prematurely terminate the padding during
    # decryption. The previous randint(0, 254) could emit zeros (and could
    # never emit 0xff); use the correct 1..255 range.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4864
4865
4866 def _base_n_table(n, table):
4867 if not table and not n:
4868 raise ValueError('Either table or n must be specified')
4869 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4870
4871 if n and n != len(table):
4872 raise ValueError(f'base {n} exceeds table length {len(table)}')
4873 return table
4874
4875
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    digits = []
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    # Digits were produced least-significant first
    return ''.join(reversed(digits))
4887
4888
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    digit_value = {char: index for index, char in enumerate(_base_n_table(n, table))}
    base = len(digit_value)
    value = 0
    for char in string:
        value = value * base + digit_value[char]
    return value
4896
4897
def decode_packed_codes(code):
    """De-obfuscate JavaScript packed with the common p.a.c.k.e.r scheme."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    symbols = symbols.split('|')

    # Map each base-n token back to its original symbol; empty entries keep
    # the token itself
    symbol_table = {}
    for index in range(int(count) - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)],
        obfuscated_code)
4914
4915
def caesar(s, alphabet, shift):
    """Rotate every character of *s* that occurs in *alphabet* by *shift* positions."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(ch):
        # str.find gives the first occurrence, -1 when absent (left unchanged)
        pos = alphabet.find(ch)
        return ch if pos < 0 else alphabet[(pos + shift) % size]

    return ''.join(map(rotate, s))
4923
4924
def rot47(s):
    """Apply the ROT47 cipher: rotate the 94 printable ASCII chars (33-126) by 47."""
    # chr(33)..chr(126) is exactly '!' through '~'
    return caesar(s, ''.join(map(chr, range(33, 127))), 47)
4927
4928
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list into a dict, stripping surrounding quotes."""
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)}
4936
4937
def urshift(val, n):
    """Unsigned (logical) right shift of a 32-bit value, like JavaScript's >>>."""
    if val >= 0:
        return val >> n
    # Reinterpret the negative value as its unsigned 32-bit equivalent first
    return (val + 0x100000000) >> n
4940
4941
def write_xattr(path, key, value):
    """Write the extended attribute *key* = *value* (bytes) onto the file at *path*.

    Tries, in order: NTFS Alternate Data Streams (Windows), the python
    xattr/pyxattr modules, and finally the setfattr/xattr executables.
    Raises XAttrMetadataError on failure, XAttrUnavailableError when no
    method is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        # The 'xattr' module exposes setxattr directly
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The external tools take the value as a command-line string argument
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
4991
4992
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the parts (as strings)
    of a uniformly random date between 1950-01-01 and 1995-12-31."""
    epoch = datetime.date(1950, 1, 1)
    span_days = (datetime.date(1995, 12, 31) - epoch).days
    picked = epoch + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(picked.year),
        month_field: str(picked.month),
        day_field: str(picked.day),
    }
5003
5004
def find_available_port(interface=''):
    """Return a currently-free TCP port number on *interface*, or None on failure."""
    try:
        with socket.socket() as probe:
            # Binding port 0 lets the OS pick an ephemeral free port
            probe.bind((interface, 0))
            return probe.getsockname()[1]
    except OSError:
        return None
5012
5013
# Templates for internet shortcut files, which are plain text files.

# Windows '.url' InternetShortcut format
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS '.webloc' property-list format
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org '.desktop' Link entry format
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps the --write-link format name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5045
5046
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.

    # Only omit the port when it is the scheme's well-known default.
    # (Previously port 80 was dropped unconditionally, mangling URLs such as
    # 'https://host:80/...', where 80 is NOT the default.)
    if iri_parts.port is not None and iri_parts.port != {'http': 80, 'https': 443}.get(iri_parts.scheme):
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5089
5090
def to_high_limit_path(path):
    """On Windows, prefix *path* with '\\\\?\\' to lift the MAX_PATH limitation.

    The maximum allowed length for individual path segments may still be limited.
    Other platforms return the path unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path
    return '\\\\?\\' + os.path.abspath(path)
5097
5098
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Traverse *obj* by *field*, transform with *func* and render via *template*.

    Returns *default* when the traversed value is falsy (or, when *ignore*
    is given, when the value is among the ignored ones).
    """
    val = traversal.traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        skip = not val
    else:
        skip = val in variadic(ignore)
    if skip:
        return default
    return template % func(val)
5104
5105
def clean_podcast_url(url):
    """Strip known podcast analytics/tracking redirect prefixes from *url*."""
    tracker_prefix = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracker_prefix, '', url)
5121
5122
# Lowercase hexadecimal alphabet used for random hex-digit generation
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version-4-shaped UUID string (lowercase hex).

    NOTE: both 'x' and 'y' placeholders get a uniformly random hex digit,
    so the variant bits are not RFC 4122 compliant — kept as-is.
    """
    def _random_hex(_match):
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_hex, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5128
5129
def make_dir(path, to_screen=None):
    """Create the parent directory of *path* (like ``mkdir -p``).

    @param path       file path whose parent directory should exist
    @param to_screen  optional callable used to report a creation failure
    @returns          True on success (or if nothing needed creating), False on failure
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # Fixed: was `if callable(to_screen) is not None:` — a bool is never
        # None, so the branch was always taken and `to_screen(...)` crashed
        # with TypeError when no callback was supplied
        if callable(to_screen):
            to_screen(f'unable to create directory {err}')
        return False
5140
5141
def get_executable_path():
    """Return the directory containing the running yt-dlp executable/script."""
    from ..update import _get_variant_and_executable_path

    _variant, exe_path = _get_variant_and_executable_path()
    return os.path.dirname(os.path.abspath(exe_path))
5146
5147
def get_user_config_dirs(package_name):
    """Yield per-user configuration directories for *package_name*, in lookup order."""
    # XDG style: $XDG_CONFIG_HOME/package_name (defaults to ~/.config/package_name)
    xdg_base = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
    yield os.path.join(xdg_base, package_name)

    # Windows style: %APPDATA%/package_name
    appdata = os.getenv('appdata')
    if appdata:
        yield os.path.join(appdata, package_name)

    # Classic dotfile: ~/.package_name
    yield os.path.join(compat_expanduser('~'), f'.{package_name}')
5160
5161
def get_system_config_dirs(package_name):
    """Yield system-wide configuration directories for *package_name*."""
    # Currently only the conventional /etc/package_name location
    yield os.path.join('/etc', package_name)
5165
5166
def time_seconds(**kwargs):
    """
    Return the current epoch time shifted by the timedelta built from **kwargs
    (callers pass a UTC offset, e.g. ``time_seconds(hours=9)`` for JST)
    """
    offset = datetime.timedelta(**kwargs).total_seconds()
    return time.time() + offset
5172
5173
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create an HS256-signed JWT in JWS Compact Serialization (as bytes).

    @param payload_data  JSON-serializable claims dict
    @param key           shared secret (str) for the HMAC-SHA256 signature
    @param headers       optional extra/overriding JOSE header fields

    NOTE: segments use standard base64 *with* padding rather than the
    unpadded base64url that RFC 7515 specifies; kept as-is since existing
    consumers may rely on the current output.
    """
    # `headers` defaults to None instead of a mutable `{}` default argument
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    # The signing input is the two encoded segments joined by '.'
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
    return token
5191
5192
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode a JWT's payload *without* verifying its signature."""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # Re-add padding that may have been stripped; superfluous '=' are ignored
    return json.loads(base64.urlsafe_b64decode(payload_b64 + '==='))
5199
5200
# Whether VT (ANSI) processing has been enabled on the Windows console;
# None on non-Windows platforms, where no explicit enabling is needed
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Best-effort check whether ANSI escape sequences can be written to *stream*."""
    on_windows = compat_os_name == 'nt'
    # Windows consoles only honour VT sequences once explicitly enabled
    if on_windows and not WINDOWS_VT_MODE:
        return False
    if not on_windows and not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
5215
5216
def windows_enable_vt_mode():
    """Ref: https://bugs.python.org/issue30075 """
    # Enables ANSI (VT) escape sequence processing on the Windows console,
    # then flips WINDOWS_VT_MODE and invalidates the cached terminal check.
    # VT support requires Windows 10 TH2 (build 10586) or newer
    if get_windows_version() < (10, 0, 10586):
        return

    import ctypes
    import ctypes.wintypes
    import msvcrt

    ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004

    dll = ctypes.WinDLL('kernel32', use_last_error=False)
    # Open the console output device directly so this works even when
    # stdout/stderr are redirected
    handle = os.open('CONOUT$', os.O_RDWR)
    try:
        h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
        dw_original_mode = ctypes.wintypes.DWORD()
        success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
        if not success:
            raise Exception('GetConsoleMode failed')

        # Preserve the existing console mode bits, only adding VT processing
        success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
            dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        if not success:
            raise Exception('SetConsoleMode failed')
    finally:
        os.close(handle)

    # Only reached when SetConsoleMode succeeded (exceptions propagate above)
    global WINDOWS_VT_MODE
    WINDOWS_VT_MODE = True
    # supports_terminal_sequences() is memoized per stream; clear it so the
    # newly-enabled VT mode is picked up
    supports_terminal_sequences.cache_clear()
5247
5248
# Matches ANSI SGR (style/color) escape sequences, e.g. '\033[0;31m'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return *string* with ANSI SGR escape sequences stripped out."""
    return _terminal_sequences_re.sub('', string)
5254
5255
def number_of_digits(number):
    """Length of the decimal rendering of *number* (any '-' sign included)."""
    rendered = '%d' % number
    return len(rendered)
5258
5259
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy values with *delim*.

    When *from_dict* is given, each value is first treated as a traversal
    path into that dict.
    """
    if from_dict is not None:
        values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(value) for value in values if value)
5264
5265
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    dimensions = [tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats]
    max_dimensions = max(dimensions, default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails

    width_str = str(max_dimensions[0])
    scaled = []
    for thumbnail in thumbnails:
        scaled.append(merge_dicts(
            {'url': re.sub(url_width_re, width_str, thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
5286
5287
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    mobj = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if mobj is None:
        return None, None, None
    start, end, total = mobj.groups()
    return int(start), int_or_none(end), int_or_none(total)
5296
5297
def read_stdin(what):
    """Announce that *what* will be read from STDIN and return the stream."""
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    else:
        eof = 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5302
5303
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """

    # BOM marks are given priority over declarations
    for bom, enc in BOMS:
        if data.startswith(bom):
            return enc, len(bom)

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    stripped = data.replace(b'\0', b'')
    declaration = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    if declaration:
        return declaration.group(1).decode(), 0
    return None, 0
5320
5321
class Config:
    """A (possibly nested) set of command-line options loaded from files/stdin.

    Each config file may itself reference further config locations via
    `--config-locations`; those are loaded recursively as child `Config`s.
    """
    own_args = None      # args given directly to this config (not children)
    parsed_args = None   # set to own_args once parsed
    filename = None      # file this config was read from, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Load *args*/*filename* into this config; returns False for duplicates."""
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own args and recursively load any referenced config locations."""
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            # Guard against config files including each other in a loop
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                # '-' means read additional options from stdin (at most once)
                if location in self._loaded_paths:
                    continue
                self._loaded_paths.add(location)
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against this config's directory
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read a config file and split it into an argv-style list.

        Returns *default* if the file cannot be opened. NB: the shared
        mutable default is only ever returned, never mutated here.
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)  # skip any BOM before decoding
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # Fixed: the message contained a literal "(unknown)" instead of
            # interpolating the actual file name
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return *opts* with values of credential options replaced by 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            # `--opt value` form: blank out the following argument
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Load a child config, sharing the already-loaded-paths guard set."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """All args, child configs first (so later/own args take precedence)."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
5429
5430
class WebSocketsWrapper:
    """Wraps websockets module to use in non-async scopes

    Owns a private event loop and drives the async `websockets` connection
    synchronously via run_with_loop(). Not thread-safe: all calls are expected
    from the thread that created the instance.
    """
    # The connected websocket protocol object; set by __enter__
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Ensure the connection/loop is torn down at interpreter exit
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        # Synchronously send a message over the websocket
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        # Synchronously receive the next message from the websocket
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # NOTE(review): the loop is closed *before* _cancel_all_tasks runs
            # run_until_complete on it — if any tasks are still pending this
            # raises "Event loop is closed"; confirm whether the order is
            # intentional before changing it
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        # Run a single coroutine to completion on *loop*, then drain
        # async generators and (on 3.9+) the default executor
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        # Cancel every outstanding task on *loop* and surface any exceptions
        # they raised (other than cancellation) via the exception handler
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5500
5501
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in headers.items():
            # Title-casing normalizes e.g. 'x-foo' and 'X-FOO' to 'X-Foo'
            merged[name.title()] = value
    return merged
5505
5506
def cached_method(f):
    """Cache a method per-instance, keyed on its bound (non-self) arguments"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound = signature.bind(self, *args, **kwargs)
        bound.apply_defaults()
        # Drop `self` from the key; defaults are applied so equivalent calls hit the same entry
        key = tuple(bound.arguments.values())[1:]

        caches = vars(self).setdefault('_cached_method__cache', {})
        cache = caches.setdefault(f.__name__, {})
        try:
            return cache[key]
        except KeyError:
            cache[key] = f(self, *args, **kwargs)
            return cache[key]
    return wrapper
5522
5523
class classproperty:
    """property access for class methods with optional caching"""

    def __new__(cls, func=None, *args, **kwargs):
        # Support both bare use (@classproperty) and parametrized use
        # (@classproperty(cache=True)): with no function yet, hand back a
        # partial that will receive it on the second call.
        if not func:
            return functools.partial(cls, *args, **kwargs)
        return super().__new__(cls)

    def __init__(self, func, *, cache=False):
        functools.update_wrapper(self, func)
        self.func = func
        self._cache = {} if cache else None

    def __get__(self, _, cls):
        if self._cache is None:
            return self.func(cls)
        try:
            return self._cache[cls]
        except KeyError:
            # Cached per *class*, so subclasses get their own value
            self._cache[cls] = self.func(cls)
            return self._cache[cls]
5542
5543
class function_with_repr:
    """Wrap a callable so that repr() of the wrapper is stable and informative."""

    def __init__(self, func, repr_=None):
        functools.update_wrapper(self, func)
        self.func, self.__repr = func, repr_

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def __repr__(self):
        # Use the explicit repr when one was supplied, else the qualified name
        return self.__repr or f'{self.func.__module__}.{self.func.__qualname__}'
5556
5557
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        # Iterates the attribute *values*, in definition order
        yield from self.__dict__.values()

    @property
    def items_(self):
        # Trailing underscore avoids clashing with an attribute named `items`
        return self.__dict__.items()
5567
5568
# Known file extensions grouped by media kind; `common_*` holds the most
# frequently seen ones and is folded into the full `video`/`audio` lists below
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
# Merge the common extensions into the full per-kind tuples
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# Every extension yt-dlp recognizes as downloadable media or a manifest
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5583
5584
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # attempt: 1-based count of the current try; _error: last stored error,
    # with NO_DEFAULT acting as the "no error this attempt" sentinel
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        # kwargs are pre-bound into the callback (e.g. sleep_func/info/warn)
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # Retry while the last attempt stored an error (or none ran yet)
        # and the attempt budget is not exhausted
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # Expose the sentinel as None to callers
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            # Reset to the sentinel; the loop body sets .error on failure
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            # Budget exhausted: report via `error` if given, else re-raise
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        # sleep_func may be a callable taking the retry number, or a constant
        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5639
5640
def make_archive_id(ie, video_id):
    """Build a download-archive entry: lowercased extractor key + video id."""
    if not isinstance(ie, str):
        ie = ie.ie_key()
    return f'{ie.lower()} {video_id}'
5644
5645
def truncate_string(s, left, right=0):
    """Shorten *s* to at most ``left + right`` characters, marking the cut with '...'.

    Keeps the first ``left - 3`` and (optionally) the last ``right`` characters.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    suffix = s[-right:] if right else ''
    return s[:left - 3] + '...' + suffix
5651
5652
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Expand *options* (with '-' removal prefixes and aliases) into an ordered set.

    @param options     requested item names; a leading '-' removes instead of adds
    @param alias_dict  maps alias -> list of items; must contain an 'all' alias
                       listing every valid item
    @param use_regex   treat non-alias entries as regexes matched against 'all'
    @param start       initial selection to build upon
    @raises ValueError if an entry matches neither an alias nor 'all'
    """
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for val in options:
        discard = val.startswith('-')
        if discard:
            val = val[1:]

        if val in alias_dict:
            # Discarding an alias flips the sign of each of its members
            val = alias_dict[val] if not discard else [
                i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(val, alias_dict, start=requested)
            continue

        current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
                   else [val] if val in alias_dict['all'] else None)
        if current is None:
            raise ValueError(val)

        if discard:
            # Remove every earlier occurrence of each matched item
            for item in current:
                while item in requested:
                    requested.remove(item)
        else:
            requested.extend(current)

    return orderedSet(requested)
5681
5682
5683 # TODO: Rewrite
class FormatSorter:
    """Sorts formats according to `--format-sort`-style field specifications.

    Each format is mapped to a tuple of per-field preference tuples
    (see _calculate_field_preference_from_value); formats are then compared
    by that key, larger meaning "better".
    """
    # Parses one sort token: optional '+' (reverse), field name and an
    # optional ':limit' (cap) or '~limit' (prefer closest) suffix
    regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
               'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
    # Ordering that mimics youtube-dl's historical behavior
    ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                    'height', 'width', 'proto', 'vext', 'abr', 'aext',
                    'fps', 'fs_approx', 'source', 'id')

    # Per-field configuration: 'type' selects the comparison strategy
    # (ordered/boolean/extractor/combined/multiple/alias/plain field),
    # 'field' maps to the actual format-dict key(s), 'convert' normalizes values
    settings = {
        'vcodec': {'type': 'ordered', 'regex': True,
                   'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
        'acodec': {'type': 'ordered', 'regex': True,
                   'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
        'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
                'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
        'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
                  'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
        'vext': {'type': 'ordered', 'field': 'video_ext',
                 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
                 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
        'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
                 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
                 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
        'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
        'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
                       'field': ('vcodec', 'acodec'),
                       'function': lambda it: int(any(v != 'none' for v in it))},
        'ie_pref': {'priority': True, 'type': 'extractor'},
        'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
        'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
        'quality': {'convert': 'float', 'default': -1},
        'filesize': {'convert': 'bytes'},
        'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
        'id': {'convert': 'string', 'field': 'format_id'},
        'height': {'convert': 'float_none'},
        'width': {'convert': 'float_none'},
        'fps': {'convert': 'float_none'},
        'channels': {'convert': 'float_none', 'field': 'audio_channels'},
        'tbr': {'convert': 'float_none'},
        'vbr': {'convert': 'float_none'},
        'abr': {'convert': 'float_none'},
        'asr': {'convert': 'float_none'},
        'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},

        'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
        'br': {'type': 'multiple', 'field': ('tbr', 'vbr', 'abr'),
               'function': lambda it: next(filter(None, it), None)},
        'size': {'type': 'multiple', 'field': ('filesize', 'fs_approx'),
                 'function': lambda it: next(filter(None, it), None)},
        'ext': {'type': 'combined', 'field': ('vext', 'aext')},
        'res': {'type': 'multiple', 'field': ('height', 'width'),
                'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},

        # Actual field names
        'format_id': {'type': 'alias', 'field': 'id'},
        'preference': {'type': 'alias', 'field': 'ie_pref'},
        'language_preference': {'type': 'alias', 'field': 'lang'},
        'source_preference': {'type': 'alias', 'field': 'source'},
        'protocol': {'type': 'alias', 'field': 'proto'},
        'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
        'audio_channels': {'type': 'alias', 'field': 'channels'},

        # Deprecated
        'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
        'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
        'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
        'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
        'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
        'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
        'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
        'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
        'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
        'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
        'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
        'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
        'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
        'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
        'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
        'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
        'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
    }

    def __init__(self, ydl, field_preference):
        self.ydl = ydl
        self._order = []
        self.evaluate_params(self.ydl.params, field_preference)
        if ydl.params.get('verbose'):
            self.print_verbose_info(self.ydl.write_debug)

    def _get_field_setting(self, field, key):
        """Look up *key* for *field* in `settings`, filling in defaults lazily."""
        if field not in self.settings:
            if key in ('forced', 'priority'):
                return False
            self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
                                        'deprecated and may be removed in a future version')
            self.settings[field] = {}
        propObj = self.settings[field]
        if key not in propObj:
            # Derive and memoize a sensible default based on the field type
            type = propObj.get('type')
            if key == 'field':
                default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
            elif key == 'convert':
                default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
            else:
                default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
            propObj[key] = default
        return propObj[key]

    def _resolve_field_value(self, field, value, convertNone=False):
        """Normalize a raw format value for *field* into a comparable number/string."""
        if value is None:
            if not convertNone:
                return None
        else:
            value = value.lower()
        conversion = self._get_field_setting(field, 'convert')
        if conversion == 'ignore':
            return None
        if conversion == 'string':
            return value
        elif conversion == 'float_none':
            return float_or_none(value)
        elif conversion == 'bytes':
            return parse_bytes(value)
        elif conversion == 'order':
            # Map the value to its (reversed) index in the configured order,
            # so earlier entries get larger (better) scores
            order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
            use_regex = self._get_field_setting(field, 'regex')
            list_length = len(order_list)
            empty_pos = order_list.index('') if '' in order_list else list_length + 1
            if use_regex and value is not None:
                for i, regex in enumerate(order_list):
                    if regex and re.match(regex, value):
                        return list_length - i
                return list_length - empty_pos  # not in list
            else:  # not regex or value = None
                return list_length - (order_list.index(value) if value in order_list else empty_pos)
        else:
            if value.isnumeric():
                return float(value)
            else:
                # Mixed string data: switch the field to string comparison from now on
                self.settings[field]['convert'] = 'string'
                return value

    def evaluate_params(self, params, sort_extractor):
        """Build self._order (and per-field limit settings) from user/extractor/default sort specs."""
        self._use_free_order = params.get('prefer_free_formats', False)
        self._sort_user = params.get('format_sort', [])
        self._sort_extractor = sort_extractor

        def add_item(field, reverse, closest, limit_text):
            # Register one sort field; first occurrence wins
            field = field.lower()
            if field in self._order:
                return
            self._order.append(field)
            limit = self._resolve_field_value(field, limit_text)
            data = {
                'reverse': reverse,
                'closest': False if limit is None else closest,
                'limit_text': limit_text,
                'limit': limit}
            if field in self.settings:
                self.settings[field].update(data)
            else:
                self.settings[field] = data

        # forced fields always lead; priority fields lead unless format_sort_force;
        # then user prefs, extractor prefs, and finally the defaults
        sort_list = (
            tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
            + (tuple() if params.get('format_sort_force', False)
               else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
            + tuple(self._sort_user) + tuple(sort_extractor) + self.default)

        for item in sort_list:
            match = re.match(self.regex, item)
            if match is None:
                raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
            field = match.group('field')
            if field is None:
                continue
            if self._get_field_setting(field, 'type') == 'alias':
                alias, field = field, self._get_field_setting(field, 'field')
                if self._get_field_setting(alias, 'deprecated'):
                    self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
                                                f'be removed in a future version. Please use {field} instead')
            reverse = match.group('reverse') is not None
            closest = match.group('separator') == '~'
            limit_text = match.group('limit')

            has_limit = limit_text is not None
            has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
            has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')

            fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
            limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
            limit_count = len(limits)
            for (i, f) in enumerate(fields):
                # Reuse the single limit for all sub-fields when only one is given
                add_item(f, reverse, closest,
                         limits[i] if i < limit_count
                         else limits[0] if has_limit and not has_multiple_limits
                         else None)

    def print_verbose_info(self, write_debug):
        """Emit the effective sort order (with limits) via *write_debug*."""
        if self._sort_user:
            write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
        if self._sort_extractor:
            write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
        write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
            '+' if self._get_field_setting(field, 'reverse') else '', field,
            '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
                          self._get_field_setting(field, 'limit_text'),
                          self._get_field_setting(field, 'limit'))
            if self._get_field_setting(field, 'limit_text') is not None else '')
            for field in self._order if self._get_field_setting(field, 'visible')]))

    def _calculate_field_preference_from_value(self, format, field, type, value):
        """Turn one field's value into a comparison tuple (larger compares better)."""
        reverse = self._get_field_setting(field, 'reverse')
        closest = self._get_field_setting(field, 'closest')
        limit = self._get_field_setting(field, 'limit')

        if type == 'extractor':
            maximum = self._get_field_setting(field, 'max')
            if value is None or (maximum is not None and value >= maximum):
                value = -1
        elif type == 'boolean':
            in_list = self._get_field_setting(field, 'in_list')
            not_in_list = self._get_field_setting(field, 'not_in_list')
            value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
        elif type == 'ordered':
            value = self._resolve_field_value(field, value, True)

        # try to convert to number
        val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
        is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
        if is_num:
            value = val_num

        # First tuple element ranks the case (missing < below-limit < normal < string),
        # the rest order values within that case
        return ((-10, 0) if value is None
                else (1, value, 0) if not is_num  # if a field has mixed strings and numbers, strings are sorted higher
                else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
                else (0, value, 0) if not reverse and (limit is None or value <= limit)
                else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
                else (-1, value, 0))

    def _calculate_field_preference(self, format, field):
        """Extract the (possibly combined) value for *field* and score it."""
        type = self._get_field_setting(field, 'type')  # extractor, boolean, ordered, field, multiple
        get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
        if type == 'multiple':
            type = 'field'  # Only 'field' is allowed in multiple for now
            actual_fields = self._get_field_setting(field, 'field')

            value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
        else:
            value = get_value(field)
        return self._calculate_field_preference_from_value(format, field, type, value)

    def calculate_preference(self, format):
        """Fill in missing format fields, then return its full sort key tuple."""
        # Determine missing protocol
        if not format.get('protocol'):
            format['protocol'] = determine_protocol(format)

        # Determine missing ext
        if not format.get('ext') and 'url' in format:
            format['ext'] = determine_ext(format['url'])
        if format.get('vcodec') == 'none':
            format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
            format['video_ext'] = 'none'
        else:
            format['video_ext'] = format['ext']
            format['audio_ext'] = 'none'
        # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'):  # Not supported?
        #     format['preference'] = -1000

        if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
            # HEVC-over-FLV is out-of-spec by FLV's original spec
            # ref. https://trac.ffmpeg.org/ticket/6389
            # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
            format['preference'] = -100

        # Determine missing bitrates
        if format.get('vcodec') == 'none':
            format['vbr'] = 0
        if format.get('acodec') == 'none':
            format['abr'] = 0
        if not format.get('vbr') and format.get('vcodec') != 'none':
            format['vbr'] = try_call(lambda: format['tbr'] - format['abr']) or None
        if not format.get('abr') and format.get('acodec') != 'none':
            format['abr'] = try_call(lambda: format['tbr'] - format['vbr']) or None
        if not format.get('tbr'):
            format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) or None

        return tuple(self._calculate_field_preference(format, field) for field in self._order)