import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)
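
# Illustrative usage (editor's addition, not part of the original source):
#   >>> doc = xml.etree.ElementTree.fromstring('<root><p x="a"/><p x="b"/></root>')
#   >>> find_xpath_attr(doc, './/p', 'x', 'b').attrib
#   {'x': 'b'}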

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
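
# Illustrative usage (editor's addition, not part of the original source):
#   >>> xpath_with_ns('media:song/media:url', {'media': 'http://search.yahoo.com/mrss/'})
#   '{http://search.yahoo.com/mrss/}song/{http://search.yahoo.com/mrss/}url'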


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None
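
# Illustrative usage (editor's addition, not part of the original source):
#   >>> get_element_by_class('foo', '<div class="foo bar">content</div>')
#   'content'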


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
406 """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
411 """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
489 """
490 def find_or_raise(haystack, needle, exc):
491 try:
492 return haystack.index(needle)
493 except ValueError:
494 raise exc
495 closing_tag = f'</{tag}>'
496 whole_start = find_or_raise(
497 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
498 content_start = find_or_raise(
499 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
500 content_start += whole_start + 1
501 with HTMLBreakOnClosingTagParser() as parser:
502 parser.feed(html[whole_start:content_start])
503 if not parser.tagstack or parser.tagstack[0] != tag:
504 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
505 offset = content_start
506 while offset < len(html):
507 next_closing_tag_start = find_or_raise(
508 html[offset:], closing_tag,
509 compat_HTMLParseError(f'closing {tag} tag not found'))
510 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
511 try:
512 parser.feed(html[offset:offset + next_closing_tag_end])
513 offset += next_closing_tag_end
514 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
515 return html[content_start:offset + next_closing_tag_start], \
516 html[whole_start:offset + next_closing_tag_end]
517 raise compat_HTMLParseError('unexpected end of html')
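
# Illustrative usage (editor's addition, not part of the original source):
#   >>> get_element_text_and_html_by_tag('span', '<div><span>x</span></div>')
#   ('x', '<span>x</span>')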


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
571 """Given a string for an series of HTML <li> elements,
572 return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
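
# Illustrative usage (editor's addition, not part of the original source):
#   >>> parse_list('<li data-id="1"></li><li data-id="2"></li>')
#   [{'data-id': '1'}, {'data-id': '2'}]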


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
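
# Illustrative usage (editor's addition, not part of the original source):
#   >>> clean_html('<b>Bold</b> <br> text')
#   'Bold\ntext'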


class LenientJSONDecoder(json.JSONDecoder):
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        doc = err.doc[:err.pos]
        # We need to add comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
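
# Illustrative usage (editor's addition, not part of the original source):
#   >>> LenientJSONDecoder(ignore_extra=True).decode('{"a": 1} trailing garbage')
#   {'a': 1}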


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
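
# Illustrative usage (editor's addition, not part of the original source):
#   >>> timeconvert('Sun, 06 Nov 1994 08:49:37 GMT')
#   784111777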


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
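
# Illustrative behaviour (editor's addition; outputs hand-derived from the
# rules above, not taken from the original source):
#   >>> sanitize_filename('New World record at 0:12:34', restricted=True)
#   'New_World_record_at_0_12_34'
#   >>> sanitize_filename('a/b\\c')  # separators become look-alike glyphs
#   'a⧸b⧹c'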


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
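
# Illustrative behaviour (editor's addition). On non-Windows platforms the
# path is returned unchanged unless force=True:
#   >>> sanitize_path('part1/part2?.ext', force=True)  # on POSIX
#   'part1/part2#.ext'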


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
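
# Illustrative usage (editor's addition, not part of the original source):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('rmtp://example.com/live')
#   'rtmp://example.com/live'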


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
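
# Illustrative usage (editor's addition; credentials are hypothetical):
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')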


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
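
# Illustrative usage (editor's addition, not part of the original source):
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]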


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
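
# Illustrative usage (editor's addition, not part of the original source):
#   >>> unescapeHTML('&eacute;clair &amp; more')
#   'éclair & more'
#   >>> unescapeHTML('&#x2F;')
#   '/'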


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
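
# Illustrative usage of Popen.run (editor's addition, not part of the original
# source; assumes a POSIX `echo` binary is on PATH):
#   >>> Popen.run(['echo', 'hi'], text=True, stdout=subprocess.PIPE)
#   ('hi\n', '', 0)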


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)
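
# Illustrative usage (editor's addition, not part of the original source):
#   >>> timetuple_from_msec(90061001)
#   Time(hours=25, minutes=1, seconds=1, milliseconds=1)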


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
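
# Illustrative usage (editor's addition, not part of the original source):
#   >>> formatSeconds(3661)
#   '1:01:01'
#   >>> formatSeconds(61, msec=True)
#   '1:01.000'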


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
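
# Illustrative behaviour (editor's addition, not part of the original source):
#   >>> handle_youtubedl_headers({'User-Agent': 'UA', 'Accept-Encoding': 'gzip', 'Youtubedl-no-compression': '1'})
#   {'User-Agent': 'UA'}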


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
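
# Illustrative usage (editor's addition; the proxy address is hypothetical):
#   conn_class = make_socks_conn_class(
#       http.client.HTTPConnection, 'socks5://127.0.0.1:1080')
#   conn = conn_class('example.com', 80)  # connects through the SOCKS proxy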


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if is_path_like(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if is_path_like(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
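
# Illustrative usage (editor's addition; 'cookies.txt' is a hypothetical
# Netscape-format cookie file):
#   jar = YoutubeDLCookieJar('cookies.txt')
#   jar.load(ignore_discard=True, ignore_expires=True)
#   jar.save()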
1696
1697
1698 class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
1699 def __init__(self, cookiejar=None):
1700 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
1701
1702 def http_response(self, request, response):
1703 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
1704
1705 https_request = urllib.request.HTTPCookieProcessor.http_request
1706 https_response = http_response
1707
1708
1709 class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
1710 """YoutubeDL redirect handler
1711
1712 The code is based on HTTPRedirectHandler implementation from CPython [1].
1713
1714 This redirect handler solves two issues:
1715 - ensures the redirect URL is always unicode (a legacy Python 2 concern)
1716 - introduces support for the HTTP response status code
1717 308 Permanent Redirect [2] (formerly experimental) used by some sites [3]
1718
1719 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1720 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1721 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1722 """
1723
1724 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
1725
1726 def redirect_request(self, req, fp, code, msg, headers, newurl):
1727 """Return a Request or None in response to a redirect.
1728
1729 This is called by the http_error_30x methods when a
1730 redirection response is received. If a redirection should
1731 take place, return a new Request to allow http_error_30x to
1732 perform the redirect. Otherwise, raise HTTPError if no one
1733 else should try to handle this URL. Return None if you can't
1734 handle it, but another handler might.
1735 """
1736 m = req.get_method()
1737 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1738 or code in (301, 302, 303) and m == "POST")):
1739 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
1740 # Strictly (according to RFC 2616), 301 or 302 in response to
1741 # a POST MUST NOT cause a redirection without confirmation
1742 # from the user (of urllib.request, in this case). In practice,
1743 # essentially all clients do redirect in this case, so we do
1744 # the same.
1745
1746 # Be lenient with URIs containing a space. This is mainly
1747 # redundant with the more complete encoding done in http_error_302(),
1748 # but it is kept for compatibility with other callers.
1749 newurl = newurl.replace(' ', '%20')
1750
1751 CONTENT_HEADERS = ("content-length", "content-type")
1753 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1754
1755 # A 303 must either use GET or HEAD for subsequent request
1756 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1757 if code == 303 and m != 'HEAD':
1758 m = 'GET'
1759 # 301 and 302 redirects are commonly turned into a GET from a POST
1760 # for subsequent requests by browsers, so we'll do the same.
1761 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1762 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1763 if code in (301, 302) and m == 'POST':
1764 m = 'GET'
1765
1766 return urllib.request.Request(
1767 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1768 unverifiable=True, method=m)
1769
1770
1771 def extract_timezone(date_str):
1772 m = re.search(
1773 r'''(?x)
1774 ^.{8,}? # >=8 char non-TZ prefix, if present
1775 (?P<tz>Z| # just the UTC Z, or
1776 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1777 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1778 [ ]? # optional space
1779 (?P<sign>\+|-) # +/-
1780 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1781 $)
1782 ''', date_str)
1783 if not m:
1784 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1785 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1786 if timezone is not None:
1787 date_str = date_str[:-len(m.group('tz'))]
1788 timezone = datetime.timedelta(hours=timezone or 0)
1789 else:
1790 date_str = date_str[:-len(m.group('tz'))]
1791 if not m.group('sign'):
1792 timezone = datetime.timedelta()
1793 else:
1794 sign = 1 if m.group('sign') == '+' else -1
1795 timezone = datetime.timedelta(
1796 hours=sign * int(m.group('hours')),
1797 minutes=sign * int(m.group('minutes')))
1798 return timezone, date_str
1799
1800
1801 def parse_iso8601(date_str, delimiter='T', timezone=None):
1802 """ Return a UNIX timestamp from the given date """
1803
1804 if date_str is None:
1805 return None
1806
1807 date_str = re.sub(r'\.[0-9]+', '', date_str)
1808
1809 if timezone is None:
1810 timezone, date_str = extract_timezone(date_str)
1811
1812 with contextlib.suppress(ValueError):
1813 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1814 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1815 return calendar.timegm(dt.timetuple())
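# A quick illustration (not executed; values assume the default 'T' delimiter
# and automatic timezone extraction):
#   parse_iso8601('2023-01-15T12:30:00+02:00') == 1673778600  # i.e. 10:30 UTC
#   parse_iso8601('2023-01-15T12:30:00Z') == 1673785800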
1816
1817
1818 def date_formats(day_first=True):
1819 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1820
1821
1822 def unified_strdate(date_str, day_first=True):
1823 """Return a string with the date in the format YYYYMMDD"""
1824
1825 if date_str is None:
1826 return None
1827 upload_date = None
1828 # Replace commas
1829 date_str = date_str.replace(',', ' ')
1830 # Remove AM/PM + timezone
1831 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1832 _, date_str = extract_timezone(date_str)
1833
1834 for expression in date_formats(day_first):
1835 with contextlib.suppress(ValueError):
1836 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1837 if upload_date is None:
1838 timetuple = email.utils.parsedate_tz(date_str)
1839 if timetuple:
1840 with contextlib.suppress(ValueError):
1841 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1842 if upload_date is not None:
1843 return str(upload_date)
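# Illustrative conversions (values assume the standard DATE_FORMATS tables):
#   unified_strdate('December 21, 2010') == '20101221'
#   unified_strdate('8/7/2009', day_first=False) == '20090807'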
1844
1845
1846 def unified_timestamp(date_str, day_first=True):
1847 if date_str is None:
1848 return None
1849
1850 date_str = re.sub(r'\s+', ' ', re.sub(
1851 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1852
1853 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1854 timezone, date_str = extract_timezone(date_str)
1855
1856 # Remove AM/PM + timezone
1857 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1858
1859 # Remove unrecognized timezones from ISO 8601 alike timestamps
1860 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1861 if m:
1862 date_str = date_str[:-len(m.group('tz'))]
1863
1864 # Python only supports microseconds, so remove nanoseconds
1865 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1866 if m:
1867 date_str = m.group(1)
1868
1869 for expression in date_formats(day_first):
1870 with contextlib.suppress(ValueError):
1871 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1872 return calendar.timegm(dt.timetuple())
1873
1874 timetuple = email.utils.parsedate_tz(date_str)
1875 if timetuple:
1876 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1877
1878
1879 def determine_ext(url, default_ext='unknown_video'):
1880 if url is None or '.' not in url:
1881 return default_ext
1882 guess = url.partition('?')[0].rpartition('.')[2]
1883 if re.match(r'^[A-Za-z0-9]+$', guess):
1884 return guess
1885 # Try to extract the ext from URLs like http://example.com/foo/bar.mp4/?download
1886 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1887 return guess.rstrip('/')
1888 else:
1889 return default_ext
1890
1891
1892 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1893 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1894
1895
1896 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1897 R"""
1898 Return a datetime object from a string.
1899 Supported format:
1900 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1901
1902 @param format strftime format of DATE
1903 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1904 auto: round to the unit provided in date_str (if applicable).
1905 """
1906 auto_precision = False
1907 if precision == 'auto':
1908 auto_precision = True
1909 precision = 'microsecond'
1910 today = datetime_round(datetime.datetime.utcnow(), precision)
1911 if date_str in ('now', 'today'):
1912 return today
1913 if date_str == 'yesterday':
1914 return today - datetime.timedelta(days=1)
1915 match = re.match(
1916 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1917 date_str)
1918 if match is not None:
1919 start_time = datetime_from_str(match.group('start'), precision, format)
1920 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1921 unit = match.group('unit')
1922 if unit in ('month', 'year'):
1923 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1924 unit = 'day'
1925 else:
1926 if unit == 'week':
1927 unit = 'day'
1928 time *= 7
1929 delta = datetime.timedelta(**{unit + 's': time})
1930 new_date = start_time + delta
1931 if auto_precision:
1932 return datetime_round(new_date, unit)
1933 return new_date
1934
1935 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1936
1937
1938 def date_from_str(date_str, format='%Y%m%d', strict=False):
1939 R"""
1940 Return a date object from a string using datetime_from_str
1941
1942 @param strict Restrict allowed patterns to "YYYYMMDD" and
1943 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1944 """
1945 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1946 raise ValueError(f'Invalid date format "{date_str}"')
1947 return datetime_from_str(date_str, precision='microsecond', format=format).date()
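# For instance (the first result is relative to the current date, hence hedged):
#   date_from_str('now-1week') is the date 7 days ago
#   date_from_str('20230115+3days') == datetime.date(2023, 1, 18)  # needs strict=False (the default)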
1948
1949
1950 def datetime_add_months(dt, months):
1951 """Increment/Decrement a datetime object by months."""
1952 month = dt.month + months - 1
1953 year = dt.year + month // 12
1954 month = month % 12 + 1
1955 day = min(dt.day, calendar.monthrange(year, month)[1])
1956 return dt.replace(year, month, day)
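# Note the day is clamped to the target month's length, e.g. (illustrative):
#   datetime_add_months(datetime.datetime(2023, 1, 31), 1) -> 2023-02-28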
1957
1958
1959 def datetime_round(dt, precision='day'):
1960 """
1961 Round a datetime object's time to a specific precision
1962 """
1963 if precision == 'microsecond':
1964 return dt
1965
1966 unit_seconds = {
1967 'day': 86400,
1968 'hour': 3600,
1969 'minute': 60,
1970 'second': 1,
1971 }
1972 roundto = lambda x, n: ((x + n / 2) // n) * n
1973 timestamp = calendar.timegm(dt.timetuple())
1974 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1975
1976
1977 def hyphenate_date(date_str):
1978 """
1979 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1980 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1981 if match is not None:
1982 return '-'.join(match.groups())
1983 else:
1984 return date_str
1985
1986
1987 class DateRange:
1988 """Represents a time interval between two dates"""
1989
1990 def __init__(self, start=None, end=None):
1991 """start and end must be strings in the format accepted by date"""
1992 if start is not None:
1993 self.start = date_from_str(start, strict=True)
1994 else:
1995 self.start = datetime.datetime.min.date()
1996 if end is not None:
1997 self.end = date_from_str(end, strict=True)
1998 else:
1999 self.end = datetime.datetime.max.date()
2000 if self.start > self.end:
2001 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
2002
2003 @classmethod
2004 def day(cls, day):
2005 """Returns a range that only contains the given day"""
2006 return cls(day, day)
2007
2008 def __contains__(self, date):
2009 """Check if the date is in the range"""
2010 if not isinstance(date, datetime.date):
2011 date = date_from_str(date)
2012 return self.start <= date <= self.end
2013
2014 def __str__(self):
2015 return f'{self.start.isoformat()} - {self.end.isoformat()}'
2016
2017 def __eq__(self, other):
2018 return (isinstance(other, DateRange)
2019 and self.start == other.start and self.end == other.end)
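# Typical usage (a sketch; dates use the YYYYMMDD form accepted by date_from_str):
#   DateRange('20230101', '20230131')  # the whole of January 2023
#   '20230115' in DateRange.day('20230115')  # True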
2020
2021
2022 def platform_name():
2023 """ Returns the platform name as a str """
2024 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
2025 return platform.platform()
2026
2027
2028 @functools.cache
2029 def system_identifier():
2030 python_implementation = platform.python_implementation()
2031 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2032 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
2033 libc_ver = []
2034 with contextlib.suppress(OSError): # We may not have access to the executable
2035 libc_ver = platform.libc_ver()
2036
2037 return 'Python %s (%s %s %s) - %s (%s%s)' % (
2038 platform.python_version(),
2039 python_implementation,
2040 platform.machine(),
2041 platform.architecture()[0],
2042 platform.platform(),
2043 ssl.OPENSSL_VERSION,
2044 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
2045 )
2046
2047
2048 @functools.cache
2049 def get_windows_version():
2050 ''' Get Windows version. Returns () if not running on Windows '''
2051 if compat_os_name == 'nt':
2052 return version_tuple(platform.win32_ver()[1])
2053 else:
2054 return ()
2055
2056
2057 def write_string(s, out=None, encoding=None):
2058 assert isinstance(s, str)
2059 out = out or sys.stderr
2060
2061 if compat_os_name == 'nt' and supports_terminal_sequences(out):
2062 s = re.sub(r'([\r\n]+)', r' \1', s)
2063
2064 enc, buffer = None, out
2065 if 'b' in getattr(out, 'mode', ''):
2066 enc = encoding or preferredencoding()
2067 elif hasattr(out, 'buffer'):
2068 buffer = out.buffer
2069 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2070
2071 buffer.write(s.encode(enc, 'ignore') if enc else s)
2072 out.flush()
2073
2074
2075 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2076 from . import _IN_CLI
2077 if _IN_CLI:
2078 if msg in deprecation_warning._cache:
2079 return
2080 deprecation_warning._cache.add(msg)
2081 if printer:
2082 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2083 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2084 else:
2085 import warnings
2086 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2087
2088
2089 deprecation_warning._cache = set()
2090
2091
2092 def bytes_to_intlist(bs):
2093 if not bs:
2094 return []
2095 if isinstance(bs[0], int): # bytes-like objects yield ints when indexed
2096 return list(bs)
2097 else:
2098 return [ord(c) for c in bs]
2099
2100
2101 def intlist_to_bytes(xs):
2102 if not xs:
2103 return b''
2104 return struct.pack('%dB' % len(xs), *xs)
2105
2106
2107 class LockingUnsupportedError(OSError):
2108 msg = 'File locking is not supported'
2109
2110 def __init__(self):
2111 super().__init__(self.msg)
2112
2113
2114 # Cross-platform file locking
2115 if sys.platform == 'win32':
2116 import ctypes
2117 import ctypes.wintypes
2118 import msvcrt
2119
2120 class OVERLAPPED(ctypes.Structure):
2121 _fields_ = [
2122 ('Internal', ctypes.wintypes.LPVOID),
2123 ('InternalHigh', ctypes.wintypes.LPVOID),
2124 ('Offset', ctypes.wintypes.DWORD),
2125 ('OffsetHigh', ctypes.wintypes.DWORD),
2126 ('hEvent', ctypes.wintypes.HANDLE),
2127 ]
2128
2129 kernel32 = ctypes.WinDLL('kernel32')
2130 LockFileEx = kernel32.LockFileEx
2131 LockFileEx.argtypes = [
2132 ctypes.wintypes.HANDLE, # hFile
2133 ctypes.wintypes.DWORD, # dwFlags
2134 ctypes.wintypes.DWORD, # dwReserved
2135 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2136 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2137 ctypes.POINTER(OVERLAPPED) # Overlapped
2138 ]
2139 LockFileEx.restype = ctypes.wintypes.BOOL
2140 UnlockFileEx = kernel32.UnlockFileEx
2141 UnlockFileEx.argtypes = [
2142 ctypes.wintypes.HANDLE, # hFile
2143 ctypes.wintypes.DWORD, # dwReserved
2144 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2145 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2146 ctypes.POINTER(OVERLAPPED) # Overlapped
2147 ]
2148 UnlockFileEx.restype = ctypes.wintypes.BOOL
2149 whole_low = 0xffffffff
2150 whole_high = 0x7fffffff
2151
2152 def _lock_file(f, exclusive, block):
2153 overlapped = OVERLAPPED()
2154 overlapped.Offset = 0
2155 overlapped.OffsetHigh = 0
2156 overlapped.hEvent = 0
2157 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2158
2159 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2160 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2161 0, whole_low, whole_high, f._lock_file_overlapped_p):
2162 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2163 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2164
2165 def _unlock_file(f):
2166 assert f._lock_file_overlapped_p
2167 handle = msvcrt.get_osfhandle(f.fileno())
2168 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2169 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2170
2171 else:
2172 try:
2173 import fcntl
2174
2175 def _lock_file(f, exclusive, block):
2176 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2177 if not block:
2178 flags |= fcntl.LOCK_NB
2179 try:
2180 fcntl.flock(f, flags)
2181 except BlockingIOError:
2182 raise
2183 except OSError: # AOSP does not have flock()
2184 fcntl.lockf(f, flags)
2185
2186 def _unlock_file(f):
2187 try:
2188 fcntl.flock(f, fcntl.LOCK_UN)
2189 except OSError:
2190 fcntl.lockf(f, fcntl.LOCK_UN)
2191
2192 except ImportError:
2193
2194 def _lock_file(f, exclusive, block):
2195 raise LockingUnsupportedError()
2196
2197 def _unlock_file(f):
2198 raise LockingUnsupportedError()
2199
2200
2201 class locked_file:
2202 locked = False
2203
2204 def __init__(self, filename, mode, block=True, encoding=None):
2205 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2206 raise NotImplementedError(mode)
2207 self.mode, self.block = mode, block
2208
2209 writable = any(f in mode for f in 'wax+')
2210 readable = any(f in mode for f in 'r+')
2211 flags = functools.reduce(operator.ior, (
2212 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2213 getattr(os, 'O_BINARY', 0), # Windows only
2214 getattr(os, 'O_NOINHERIT', 0), # Windows only
2215 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2216 os.O_APPEND if 'a' in mode else 0,
2217 os.O_EXCL if 'x' in mode else 0,
2218 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2219 ))
2220
2221 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2222
2223 def __enter__(self):
2224 exclusive = 'r' not in self.mode
2225 try:
2226 _lock_file(self.f, exclusive, self.block)
2227 self.locked = True
2228 except OSError:
2229 self.f.close()
2230 raise
2231 if 'w' in self.mode:
2232 try:
2233 self.f.truncate()
2234 except OSError as e:
2235 if e.errno not in (
2236 errno.ESPIPE, # Illegal seek - expected for FIFO
2237 errno.EINVAL, # Invalid argument - expected for /dev/null
2238 ):
2239 raise
2240 return self
2241
2242 def unlock(self):
2243 if not self.locked:
2244 return
2245 try:
2246 _unlock_file(self.f)
2247 finally:
2248 self.locked = False
2249
2250 def __exit__(self, *_):
2251 try:
2252 self.unlock()
2253 finally:
2254 self.f.close()
2255
2256 open = __enter__
2257 close = __exit__
2258
2259 def __getattr__(self, attr):
2260 return getattr(self.f, attr)
2261
2262 def __iter__(self):
2263 return iter(self.f)
2264
2265
2266 @functools.cache
2267 def get_filesystem_encoding():
2268 encoding = sys.getfilesystemencoding()
2269 return encoding if encoding is not None else 'utf-8'
2270
2271
2272 def shell_quote(args):
2273 quoted_args = []
2274 encoding = get_filesystem_encoding()
2275 for a in args:
2276 if isinstance(a, bytes):
2277 # We may get a filename encoded with 'encodeFilename'
2278 a = a.decode(encoding)
2279 quoted_args.append(compat_shlex_quote(a))
2280 return ' '.join(quoted_args)
2281
2282
2283 def smuggle_url(url, data):
2284 """ Pass additional data in a URL for internal use. """
2285
2286 url, idata = unsmuggle_url(url, {})
2287 data.update(idata)
2288 sdata = urllib.parse.urlencode(
2289 {'__youtubedl_smuggle': json.dumps(data)})
2290 return url + '#' + sdata
2291
2292
2293 def unsmuggle_url(smug_url, default=None):
2294 if '#__youtubedl_smuggle' not in smug_url:
2295 return smug_url, default
2296 url, _, sdata = smug_url.rpartition('#')
2297 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2298 data = json.loads(jsond)
2299 return url, data
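# Round-trip sketch: smuggle_url packs the dict into the URL fragment and
# unsmuggle_url recovers it, e.g.
#   unsmuggle_url(smuggle_url('https://example.com/v', {'referer': 'x'}))
#   == ('https://example.com/v', {'referer': 'x'})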
2300
2301
2302 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2303 """ Formats numbers with decimal sufixes like K, M, etc """
2304 num, factor = float_or_none(num), float(factor)
2305 if num is None or num < 0:
2306 return None
2307 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2308 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2309 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2310 if factor == 1024:
2311 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2312 converted = num / (factor ** exponent)
2313 return fmt % (converted, suffix)
2314
2315
2316 def format_bytes(bytes):
2317 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
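# Illustrative values (decimal suffixes for factor=1000, binary for 1024):
#   format_decimal_suffix(1234, '%.1f%s') == '1.2k'
#   format_bytes(1536) == '1.50KiB'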
2318
2319
2320 def lookup_unit_table(unit_table, s, strict=False):
2321 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
2322 units_re = '|'.join(re.escape(u) for u in unit_table)
2323 m = (re.fullmatch if strict else re.match)(
2324 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
2325 if not m:
2326 return None
2327
2328 num = float(m.group('num').replace(',', '.'))
2329 mult = unit_table[m.group('unit')]
2330 return round(num * mult)
2331
2332
2333 def parse_bytes(s):
2334 """Parse a string indicating a byte quantity into an integer"""
2335 return lookup_unit_table(
2336 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2337 s.upper(), strict=True)
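# e.g., assuming the module's NUMBER_RE (digits with an optional decimal part):
#   parse_bytes('10.5M') == round(10.5 * 1024 ** 2) == 11010048  # binary units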
2338
2339
2340 def parse_filesize(s):
2341 if s is None:
2342 return None
2343
2344 # The lower-case forms are of course incorrect and unofficial,
2345 # but we support those too
2346 _UNIT_TABLE = {
2347 'B': 1,
2348 'b': 1,
2349 'bytes': 1,
2350 'KiB': 1024,
2351 'KB': 1000,
2352 'kB': 1024,
2353 'Kb': 1000,
2354 'kb': 1000,
2355 'kilobytes': 1000,
2356 'kibibytes': 1024,
2357 'MiB': 1024 ** 2,
2358 'MB': 1000 ** 2,
2359 'mB': 1024 ** 2,
2360 'Mb': 1000 ** 2,
2361 'mb': 1000 ** 2,
2362 'megabytes': 1000 ** 2,
2363 'mebibytes': 1024 ** 2,
2364 'GiB': 1024 ** 3,
2365 'GB': 1000 ** 3,
2366 'gB': 1024 ** 3,
2367 'Gb': 1000 ** 3,
2368 'gb': 1000 ** 3,
2369 'gigabytes': 1000 ** 3,
2370 'gibibytes': 1024 ** 3,
2371 'TiB': 1024 ** 4,
2372 'TB': 1000 ** 4,
2373 'tB': 1024 ** 4,
2374 'Tb': 1000 ** 4,
2375 'tb': 1000 ** 4,
2376 'terabytes': 1000 ** 4,
2377 'tebibytes': 1024 ** 4,
2378 'PiB': 1024 ** 5,
2379 'PB': 1000 ** 5,
2380 'pB': 1024 ** 5,
2381 'Pb': 1000 ** 5,
2382 'pb': 1000 ** 5,
2383 'petabytes': 1000 ** 5,
2384 'pebibytes': 1024 ** 5,
2385 'EiB': 1024 ** 6,
2386 'EB': 1000 ** 6,
2387 'eB': 1024 ** 6,
2388 'Eb': 1000 ** 6,
2389 'eb': 1000 ** 6,
2390 'exabytes': 1000 ** 6,
2391 'exbibytes': 1024 ** 6,
2392 'ZiB': 1024 ** 7,
2393 'ZB': 1000 ** 7,
2394 'zB': 1024 ** 7,
2395 'Zb': 1000 ** 7,
2396 'zb': 1000 ** 7,
2397 'zettabytes': 1000 ** 7,
2398 'zebibytes': 1024 ** 7,
2399 'YiB': 1024 ** 8,
2400 'YB': 1000 ** 8,
2401 'yB': 1024 ** 8,
2402 'Yb': 1000 ** 8,
2403 'yb': 1000 ** 8,
2404 'yottabytes': 1000 ** 8,
2405 'yobibytes': 1024 ** 8,
2406 }
2407
2408 return lookup_unit_table(_UNIT_TABLE, s)
2409
2410
2411 def parse_count(s):
2412 if s is None:
2413 return None
2414
2415 s = re.sub(r'^[^\d]+\s', '', s).strip()
2416
2417 if re.match(r'^[\d,.]+$', s):
2418 return str_to_int(s)
2419
2420 _UNIT_TABLE = {
2421 'k': 1000,
2422 'K': 1000,
2423 'm': 1000 ** 2,
2424 'M': 1000 ** 2,
2425 'kk': 1000 ** 2,
2426 'KK': 1000 ** 2,
2427 'b': 1000 ** 3,
2428 'B': 1000 ** 3,
2429 }
2430
2431 ret = lookup_unit_table(_UNIT_TABLE, s)
2432 if ret is not None:
2433 return ret
2434
2435 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2436 if mobj:
2437 return str_to_int(mobj.group(1))
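# A couple of illustrative conversions:
#   parse_count('1.2M') == 1200000
#   parse_count('12,345 views') == 12345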
2438
2439
2440 def parse_resolution(s, *, lenient=False):
2441 if s is None:
2442 return {}
2443
2444 if lenient:
2445 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2446 else:
2447 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2448 if mobj:
2449 return {
2450 'width': int(mobj.group('w')),
2451 'height': int(mobj.group('h')),
2452 }
2453
2454 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2455 if mobj:
2456 return {'height': int(mobj.group(1))}
2457
2458 mobj = re.search(r'\b([48])[kK]\b', s)
2459 if mobj:
2460 return {'height': int(mobj.group(1)) * 540}
2461
2462 return {}
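# Illustrative results (see the patterns above):
#   parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
#   parse_resolution('720p') == {'height': 720}
#   parse_resolution('4k') == {'height': 2160}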
2463
2464
2465 def parse_bitrate(s):
2466 if not isinstance(s, str):
2467 return
2468 mobj = re.search(r'\b(\d+)\s*kbps', s)
2469 if mobj:
2470 return int(mobj.group(1))
2471
2472
2473 def month_by_name(name, lang='en'):
2474 """ Return the number of a month by (locale-independently) English name """
2475
2476 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2477
2478 try:
2479 return month_names.index(name) + 1
2480 except ValueError:
2481 return None
2482
2483
2484 def month_by_abbreviation(abbrev):
2485 """ Return the number of a month by (locale-independently) English
2486 abbreviations """
2487
2488 try:
2489 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2490 except ValueError:
2491 return None
2492
2493
2494 def fix_xml_ampersands(xml_str):
2495 """Replace all the '&' by '&amp;' in XML"""
2496 return re.sub(
2497 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2498 '&amp;',
2499 xml_str)
2500
2501
2502 def setproctitle(title):
2503 assert isinstance(title, str)
2504
2505 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2506 try:
2507 import ctypes
2508 except ImportError:
2509 return
2510
2511 try:
2512 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2513 except OSError:
2514 return
2515 except TypeError:
2516 # LoadLibrary in Python 2.7.13 on Windows only expected
2517 # a bytestring; since unicode_literals turned every string into
2518 # a unicode string, it failed. Kept only as a legacy guard.
2519 return
2520 title_bytes = title.encode()
2521 buf = ctypes.create_string_buffer(len(title_bytes))
2522 buf.value = title_bytes
2523 try:
2524 libc.prctl(15, buf, 0, 0, 0)
2525 except AttributeError:
2526 return # Strange libc, just skip this
2527
2528
2529 def remove_start(s, start):
2530 return s[len(start):] if s is not None and s.startswith(start) else s
2531
2532
2533 def remove_end(s, end):
2534 return s[:-len(end)] if s is not None and s.endswith(end) else s
2535
2536
2537 def remove_quotes(s):
2538 if s is None or len(s) < 2:
2539 return s
2540 for quote in ('"', "'", ):
2541 if s[0] == quote and s[-1] == quote:
2542 return s[1:-1]
2543 return s
2544
2545
2546 def get_domain(url):
2547 """
2548 This implementation is inconsistent, but is kept for compatibility.
2549 Use this only for "webpage_url_domain"
2550 """
2551 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2552
2553
2554 def url_basename(url):
2555 path = urllib.parse.urlparse(url).path
2556 return path.strip('/').split('/')[-1]
2557
2558
2559 def base_url(url):
2560 return re.match(r'https?://[^?#]+/', url).group()
2561
2562
2563 def urljoin(base, path):
2564 if isinstance(path, bytes):
2565 path = path.decode()
2566 if not isinstance(path, str) or not path:
2567 return None
2568 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9.+-]*:)?//', path): # '-' placed last so it is not a range
2569 return path
2570 if isinstance(base, bytes):
2571 base = base.decode()
2572 if not isinstance(base, str) or not re.match(
2573 r'^(?:https?:)?//', base):
2574 return None
2575 return urllib.parse.urljoin(base, path)
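# e.g. (standard urllib.parse.urljoin semantics, shown for clarity):
#   urljoin('https://example.com/a/', 'b.mp4') == 'https://example.com/a/b.mp4'
#   urljoin('https://example.com/a/', '//cdn.example.com/b') == '//cdn.example.com/b'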
2576
2577
2578 class HEADRequest(urllib.request.Request):
2579 def get_method(self):
2580 return 'HEAD'
2581
2582
2583 class PUTRequest(urllib.request.Request):
2584 def get_method(self):
2585 return 'PUT'
2586
2587
2588 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2589 if get_attr and v is not None:
2590 v = getattr(v, get_attr, None)
2591 try:
2592 return int(v) * invscale // scale
2593 except (ValueError, TypeError, OverflowError):
2594 return default
2595
2596
2597 def str_or_none(v, default=None):
2598 return default if v is None else str(v)
2599
2600
2601 def str_to_int(int_str):
2602 """ A more relaxed version of int_or_none """
2603 if isinstance(int_str, int):
2604 return int_str
2605 elif isinstance(int_str, str):
2606 int_str = re.sub(r'[,\.\+]', '', int_str)
2607 return int_or_none(int_str)
2608
2609
2610 def float_or_none(v, scale=1, invscale=1, default=None):
2611 if v is None:
2612 return default
2613 try:
2614 return float(v) * invscale / scale
2615 except (ValueError, TypeError):
2616 return default
2617
2618
2619 def bool_or_none(v, default=None):
2620 return v if isinstance(v, bool) else default
2621
2622
2623 def strip_or_none(v, default=None):
2624 return v.strip() if isinstance(v, str) else default
2625
2626
2627 def url_or_none(url):
2628 if not url or not isinstance(url, str):
2629 return None
2630 url = url.strip()
2631 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2632
2633
2634 def request_to_url(req):
2635 if isinstance(req, urllib.request.Request):
2636 return req.get_full_url()
2637 else:
2638 return req
2639
2640
2641 def strftime_or_none(timestamp, date_format, default=None):
2642 datetime_object = None
2643 try:
2644 if isinstance(timestamp, (int, float)): # unix timestamp
2645 # Using a naive datetime here can break timestamp() on Windows
2646 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2647 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
2648 elif isinstance(timestamp, str): # assume YYYYMMDD
2649 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2650 date_format = re.sub( # Support %s on windows
2651 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
2652 return datetime_object.strftime(date_format)
2653 except (ValueError, TypeError, AttributeError):
2654 return default
2655
2656
2657 def parse_duration(s):
2658 if not isinstance(s, str):
2659 return None
2660 s = s.strip()
2661 if not s:
2662 return None
2663
2664 days, hours, mins, secs, ms = [None] * 5
2665 m = re.match(r'''(?x)
2666 (?P<before_secs>
2667 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2668 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2669 (?P<ms>[.:][0-9]+)?Z?$
2670 ''', s)
2671 if m:
2672 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2673 else:
2674 m = re.match(
2675 r'''(?ix)(?:P?
2676 (?:
2677 [0-9]+\s*y(?:ears?)?,?\s*
2678 )?
2679 (?:
2680 [0-9]+\s*m(?:onths?)?,?\s*
2681 )?
2682 (?:
2683 [0-9]+\s*w(?:eeks?)?,?\s*
2684 )?
2685 (?:
2686 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2687 )?
2688 T)?
2689 (?:
2690 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2691 )?
2692 (?:
2693 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2694 )?
2695 (?:
2696 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2697 )?Z?$''', s)
2698 if m:
2699 days, hours, mins, secs, ms = m.groups()
2700 else:
2701 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2702 if m:
2703 hours, mins = m.groups()
2704 else:
2705 return None
2706
2707 if ms:
2708 ms = ms.replace(':', '.')
2709 return sum(float(part or 0) * mult for part, mult in (
2710 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
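# Some illustrative parses (covering the three pattern branches above):
#   parse_duration('1:23:45') == 5025.0
#   parse_duration('PT2M30S') == 150.0
#   parse_duration('5 minutes') == 300.0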
2711
2712
2713 def prepend_extension(filename, ext, expected_real_ext=None):
2714 name, real_ext = os.path.splitext(filename)
2715 return (
2716 f'{name}.{ext}{real_ext}'
2717 if not expected_real_ext or real_ext[1:] == expected_real_ext
2718 else f'{filename}.{ext}')
2719
2720
2721 def replace_extension(filename, ext, expected_real_ext=None):
2722 name, real_ext = os.path.splitext(filename)
2723 return '{}.{}'.format(
2724 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2725 ext)
2726
2727
2728 def check_executable(exe, args=[]):
2729 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2730 args can be a list of arguments for a short output (like -version) """
2731 try:
2732 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2733 except OSError:
2734 return False
2735 return exe
2736
2737
2738 def _get_exe_version_output(exe, args):
2739 try:
2740 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2741 # SIGTTOU if yt-dlp is run in the background.
2742 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2743 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2744 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2745 if ret:
2746 return None
2747 except OSError:
2748 return False
2749 return stdout
2750
2751
2752 def detect_exe_version(output, version_re=None, unrecognized='present'):
2753 assert isinstance(output, str)
2754 if version_re is None:
2755 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2756 m = re.search(version_re, output)
2757 if m:
2758 return m.group(1)
2759 else:
2760 return unrecognized
2761
2762
2763 def get_exe_version(exe, args=['--version'],
2764 version_re=None, unrecognized=('present', 'broken')):
2765 """ Returns the version of the specified executable,
2766 or False if the executable is not present """
2767 unrecognized = variadic(unrecognized)
2768 assert len(unrecognized) in (1, 2)
2769 out = _get_exe_version_output(exe, args)
2770 if out is None:
2771 return unrecognized[-1]
2772 return out and detect_exe_version(out, version_re, unrecognized[0])
2773
2774
2775 def frange(start=0, stop=None, step=1):
2776 """Float range"""
2777 if stop is None:
2778 start, stop = 0, start
2779 sign = [-1, 1][step > 0] if step else 0
2780 while sign * start < sign * stop:
2781 yield start
2782 start += step
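# e.g. list(frange(0, 1, 0.25)) == [0, 0.25, 0.5, 0.75]; frange(3) mimics range(3)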
2783
2784
2785 class LazyList(collections.abc.Sequence):
2786 """Lazy immutable list from an iterable
2787 Note that slices of a LazyList are lists and not LazyList"""
2788
2789 class IndexError(IndexError):
2790 pass
2791
2792 def __init__(self, iterable, *, reverse=False, _cache=None):
2793 self._iterable = iter(iterable)
2794 self._cache = [] if _cache is None else _cache
2795 self._reversed = reverse
2796
2797 def __iter__(self):
2798 if self._reversed:
2799 # We need to consume the entire iterable to iterate in reverse
2800 yield from self.exhaust()
2801 return
2802 yield from self._cache
2803 for item in self._iterable:
2804 self._cache.append(item)
2805 yield item
2806
2807 def _exhaust(self):
2808 self._cache.extend(self._iterable)
2809 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2810 return self._cache
2811
2812 def exhaust(self):
2813 """Evaluate the entire iterable"""
2814 return self._exhaust()[::-1 if self._reversed else 1]
2815
2816 @staticmethod
2817 def _reverse_index(x):
2818 return None if x is None else ~x
2819
2820 def __getitem__(self, idx):
2821 if isinstance(idx, slice):
2822 if self._reversed:
2823 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2824 start, stop, step = idx.start, idx.stop, idx.step or 1
2825 elif isinstance(idx, int):
2826 if self._reversed:
2827 idx = self._reverse_index(idx)
2828 start, stop, step = idx, idx, 0
2829 else:
2830 raise TypeError('indices must be integers or slices')
2831 if ((start or 0) < 0 or (stop or 0) < 0
2832 or (start is None and step < 0)
2833 or (stop is None and step > 0)):
2834 # We need to consume the entire iterable to be able to slice from the end
2835 # Obviously, never use this with infinite iterables
2836 self._exhaust()
2837 try:
2838 return self._cache[idx]
2839 except IndexError as e:
2840 raise self.IndexError(e) from e
2841 n = max(start or 0, stop or 0) - len(self._cache) + 1
2842 if n > 0:
2843 self._cache.extend(itertools.islice(self._iterable, n))
2844 try:
2845 return self._cache[idx]
2846 except IndexError as e:
2847 raise self.IndexError(e) from e
2848
2849 def __bool__(self):
2850 try:
2851 self[-1] if self._reversed else self[0]
2852 except self.IndexError:
2853 return False
2854 return True
2855
2856 def __len__(self):
2857 self._exhaust()
2858 return len(self._cache)
2859
2860 def __reversed__(self):
2861 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2862
2863 def __copy__(self):
2864 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2865
2866 def __repr__(self):
2867 # repr and str should mimic a list. So we exhaust the iterable
2868 return repr(self.exhaust())
2869
2870 def __str__(self):
2871 return repr(self.exhaust())
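# Usage sketch (safe even for infinite iterables, as long as nothing forces
# exhaustion - e.g. negative indices, len() or repr()):
#   lst = LazyList(itertools.count())
#   lst[5] == 5  # consumes only the first six items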
2872
2873
2874 class PagedList:
2875
2876 class IndexError(IndexError):
2877 pass
2878
2879 def __len__(self):
2880 # This is only useful for tests
2881 return len(self.getslice())
2882
2883 def __init__(self, pagefunc, pagesize, use_cache=True):
2884 self._pagefunc = pagefunc
2885 self._pagesize = pagesize
2886 self._pagecount = float('inf')
2887 self._use_cache = use_cache
2888 self._cache = {}
2889
2890 def getpage(self, pagenum):
2891 page_results = self._cache.get(pagenum)
2892 if page_results is None:
2893 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2894 if self._use_cache:
2895 self._cache[pagenum] = page_results
2896 return page_results
2897
2898 def getslice(self, start=0, end=None):
2899 return list(self._getslice(start, end))
2900
2901 def _getslice(self, start, end):
2902 raise NotImplementedError('This method must be implemented by subclasses')
2903
2904 def __getitem__(self, idx):
2905 assert self._use_cache, 'Indexing PagedList requires cache'
2906 if not isinstance(idx, int) or idx < 0:
2907 raise TypeError('indices must be non-negative integers')
2908 entries = self.getslice(idx, idx + 1)
2909 if not entries:
2910 raise self.IndexError()
2911 return entries[0]
2912
2913
2914 class OnDemandPagedList(PagedList):
2915 """Download pages until a page with less than maximum results"""
2916
2917 def _getslice(self, start, end):
2918 for pagenum in itertools.count(start // self._pagesize):
2919 firstid = pagenum * self._pagesize
2920 nextfirstid = pagenum * self._pagesize + self._pagesize
2921 if start >= nextfirstid:
2922 continue
2923
2924 startv = (
2925 start % self._pagesize
2926 if firstid <= start < nextfirstid
2927 else 0)
2928 endv = (
2929 ((end - 1) % self._pagesize) + 1
2930 if (end is not None and firstid <= end <= nextfirstid)
2931 else None)
2932
2933 try:
2934 page_results = self.getpage(pagenum)
2935 except Exception:
2936 self._pagecount = pagenum - 1
2937 raise
2938 if startv != 0 or endv is not None:
2939 page_results = page_results[startv:endv]
2940 yield from page_results
2941
2942 # A little optimization: if the current page is not "full", i.e. does
2943 # not contain page_size videos, then we can assume that this page
2944 # is the last one - there are no more ids on further pages,
2945 # so there is no need to query again.
2946 if len(page_results) + startv < self._pagesize:
2947 break
2948
2949 # If we got the whole page, but the next page is not interesting,
2950 # break out early as well
2951 if end == nextfirstid:
2952 break
2953
2954
2955 class InAdvancePagedList(PagedList):
2956 """PagedList with total number of pages known in advance"""
2957
2958 def __init__(self, pagefunc, pagecount, pagesize):
2959 PagedList.__init__(self, pagefunc, pagesize, True)
2960 self._pagecount = pagecount
2961
2962 def _getslice(self, start, end):
2963 start_page = start // self._pagesize
2964 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2965 skip_elems = start - start_page * self._pagesize
2966 only_more = None if end is None else end - start
2967 for pagenum in range(start_page, end_page):
2968 page_results = self.getpage(pagenum)
2969 if skip_elems:
2970 page_results = page_results[skip_elems:]
2971 skip_elems = None
2972 if only_more is not None:
2973 if len(page_results) < only_more:
2974 only_more -= len(page_results)
2975 else:
2976 yield from page_results[:only_more]
2977 break
2978 yield from page_results
2979
2980
2981 class PlaylistEntries:
2982 MissingEntry = object()
2983 is_exhausted = False
2984
2985 def __init__(self, ydl, info_dict):
2986 self.ydl = ydl
2987
2988 # _entries must be assigned now since infodict can change during iteration
2989 entries = info_dict.get('entries')
2990 if entries is None:
2991 raise EntryNotInPlaylist('There are no entries')
2992 elif isinstance(entries, list):
2993 self.is_exhausted = True
2994
2995 requested_entries = info_dict.get('requested_entries')
2996 self.is_incomplete = requested_entries is not None
2997 if self.is_incomplete:
2998 assert self.is_exhausted
2999 self._entries = [self.MissingEntry] * max(requested_entries or [0])
3000 for i, entry in zip(requested_entries, entries):
3001 self._entries[i - 1] = entry
3002 elif isinstance(entries, (list, PagedList, LazyList)):
3003 self._entries = entries
3004 else:
3005 self._entries = LazyList(entries)
3006
3007 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
3008 (?P<start>[+-]?\d+)?
3009 (?P<range>[:-]
3010 (?P<end>[+-]?\d+|inf(?:inite)?)?
3011 (?::(?P<step>[+-]?\d+))?
3012 )?''')
3013
3014 @classmethod
3015 def parse_playlist_items(cls, string):
3016 for segment in string.split(','):
3017 if not segment:
3018 raise ValueError('There are two or more consecutive commas')
3019 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
3020 if not mobj:
3021 raise ValueError(f'{segment!r} is not a valid specification')
3022 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
3023 if int_or_none(step) == 0:
3024 raise ValueError(f'Step in {segment!r} cannot be zero')
3025 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
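# For example, '1:3,7' selects items 1-3 and item 7, '::2' every other item,
# and '-1' the last item (negative indices count from the end).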
3026
3027 def get_requested_items(self):
3028 playlist_items = self.ydl.params.get('playlist_items')
3029 playlist_start = self.ydl.params.get('playliststart', 1)
3030 playlist_end = self.ydl.params.get('playlistend')
3031 # For backwards compatibility, interpret -1 as whole list
3032 if playlist_end in (-1, None):
3033 playlist_end = ''
3034 if not playlist_items:
3035 playlist_items = f'{playlist_start}:{playlist_end}'
3036 elif playlist_start != 1 or playlist_end:
3037 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
3038
3039 for index in self.parse_playlist_items(playlist_items):
3040 for i, entry in self[index]:
3041 yield i, entry
3042 if not entry:
3043 continue
3044 try:
3045 # The item may have just been added to the archive. Don't break due to it
3046 if not self.ydl.params.get('lazy_playlist'):
3047 # TODO: Add auto-generated fields
3048 self.ydl._match_entry(entry, incomplete=True, silent=True)
3049 except (ExistingVideoReached, RejectedVideoReached):
3050 return
3051
3052 def get_full_count(self):
3053 if self.is_exhausted and not self.is_incomplete:
3054 return len(self)
3055 elif isinstance(self._entries, InAdvancePagedList):
3056 if self._entries._pagesize == 1:
3057 return self._entries._pagecount
3058
3059 @functools.cached_property
3060 def _getter(self):
3061 if isinstance(self._entries, list):
3062 def get_entry(i):
3063 try:
3064 entry = self._entries[i]
3065 except IndexError:
3066 entry = self.MissingEntry
3067 if not self.is_incomplete:
3068 raise self.IndexError()
3069 if entry is self.MissingEntry:
3070 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
3071 return entry
3072 else:
3073 def get_entry(i):
3074 try:
3075 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3076 except (LazyList.IndexError, PagedList.IndexError):
3077 raise self.IndexError()
3078 return get_entry
3079
3080 def __getitem__(self, idx):
3081 if isinstance(idx, int):
3082 idx = slice(idx, idx)
3083
3084 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3085 step = 1 if idx.step is None else idx.step
3086 if idx.start is None:
3087 start = 0 if step > 0 else len(self) - 1
3088 else:
3089 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3090
3091 # NB: Do not call len(self) when idx == [:]
3092 if idx.stop is None:
3093 stop = 0 if step < 0 else float('inf')
3094 else:
3095 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3096 stop += [-1, 1][step > 0]
3097
3098 for i in frange(start, stop, step):
3099 if i < 0:
3100 continue
3101 try:
3102 entry = self._getter(i)
3103 except self.IndexError:
3104 self.is_exhausted = True
3105 if step > 0:
3106 break
3107 continue
3108 yield i + 1, entry
3109
3110 def __len__(self):
3111 return len(tuple(self[:]))
3112
3113 class IndexError(IndexError):
3114 pass
3115
3116
3117 def uppercase_escape(s):
3118 unicode_escape = codecs.getdecoder('unicode_escape')
3119 return re.sub(
3120 r'\\U[0-9a-fA-F]{8}',
3121 lambda m: unicode_escape(m.group(0))[0],
3122 s)
3123
3124
3125 def lowercase_escape(s):
3126 unicode_escape = codecs.getdecoder('unicode_escape')
3127 return re.sub(
3128 r'\\u[0-9a-fA-F]{4}',
3129 lambda m: unicode_escape(m.group(0))[0],
3130 s)
3131
3132
3133 def escape_rfc3986(s):
3134 """Escape non-ASCII characters as suggested by RFC 3986"""
3135 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3136
3137
3138 def escape_url(url):
3139 """Escape URL as suggested by RFC 3986"""
3140 url_parsed = urllib.parse.urlparse(url)
3141 return url_parsed._replace(
3142 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3143 path=escape_rfc3986(url_parsed.path),
3144 params=escape_rfc3986(url_parsed.params),
3145 query=escape_rfc3986(url_parsed.query),
3146 fragment=escape_rfc3986(url_parsed.fragment)
3147 ).geturl()
3148
3149
3150 def parse_qs(url, **kwargs):
3151 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
3152
3153
3154 def read_batch_urls(batch_fd):
3155 def fixup(url):
3156 if not isinstance(url, str):
3157 url = url.decode('utf-8', 'replace')
3158 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3159 for bom in BOM_UTF8:
3160 if url.startswith(bom):
3161 url = url[len(bom):]
3162 url = url.lstrip()
3163 if not url or url.startswith(('#', ';', ']')):
3164 return False
3165 # "#" cannot be stripped out since it is part of the URI
3166 # However, it can be safely stripped out if it follows a whitespace
3167 return re.split(r'\s#', url, maxsplit=1)[0].rstrip()
3168
3169 with contextlib.closing(batch_fd) as fd:
3170 return [url for url in map(fixup, fd) if url]
3171
3172
3173 def urlencode_postdata(*args, **kargs):
3174 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3175
3176
3177 def update_url(url, *, query_update=None, **kwargs):
3178 """Replace URL components specified by kwargs
3179 @param url str or parsed URL tuple
3180 @param query_update update query
3181 @returns str
3182 """
3183 if isinstance(url, str):
3184 if not kwargs and not query_update:
3185 return url
3186 else:
3187 url = urllib.parse.urlparse(url)
3188 if query_update:
3189 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
3190 kwargs['query'] = urllib.parse.urlencode({
3191 **urllib.parse.parse_qs(url.query),
3192 **query_update
3193 }, True)
3194 return urllib.parse.urlunparse(url._replace(**kwargs))
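# A sketch of query merging (existing parameters are preserved):
#   update_url('https://example.com/a?x=1', query_update={'y': 2})
#   == 'https://example.com/a?x=1&y=2'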
3195
3196
3197 def update_url_query(url, query):
3198 return update_url(url, query_update=query)
3199
3200
3201 def update_Request(req, url=None, data=None, headers=None, query=None):
3202 req_headers = req.headers.copy()
3203 req_headers.update(headers or {})
3204 req_data = data or req.data
3205 req_url = update_url_query(url or req.get_full_url(), query)
3206 req_get_method = req.get_method()
3207 if req_get_method == 'HEAD':
3208 req_type = HEADRequest
3209 elif req_get_method == 'PUT':
3210 req_type = PUTRequest
3211 else:
3212 req_type = urllib.request.Request
3213 new_req = req_type(
3214 req_url, data=req_data, headers=req_headers,
3215 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3216 if hasattr(req, 'timeout'):
3217 new_req.timeout = req.timeout
3218 return new_req
3219
3220
3221 def _multipart_encode_impl(data, boundary):
3222 content_type = 'multipart/form-data; boundary=%s' % boundary
3223
3224 out = b''
3225 for k, v in data.items():
3226 out += b'--' + boundary.encode('ascii') + b'\r\n'
3227 if isinstance(k, str):
3228 k = k.encode()
3229 if isinstance(v, str):
3230 v = v.encode()
3231 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3232 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3233 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3234 if boundary.encode('ascii') in content:
3235 raise ValueError('Boundary overlaps with data')
3236 out += content
3237
3238 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3239
3240 return out, content_type
3241
3242
3243 def multipart_encode(data, boundary=None):
3244 '''
3245 Encode a dict to RFC 7578-compliant form-data
3246
3247 data:
3248 A dict where keys and values can be either Unicode or bytes-like
3249 objects.
3250 boundary:
3251 If specified, it must be a Unicode object and is used as the boundary.
3252 Otherwise a random boundary is generated.
3253
3254 Reference: https://tools.ietf.org/html/rfc7578
3255 '''
3256 has_specified_boundary = boundary is not None
3257
3258 while True:
3259 if boundary is None:
3260 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3261
3262 try:
3263 out, content_type = _multipart_encode_impl(data, boundary)
3264 break
3265 except ValueError:
3266 if has_specified_boundary:
3267 raise
3268 boundary = None
3269
3270 return out, content_type
3271
3272
3273 def variadic(x, allowed_types=(str, bytes, dict)):
3274 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
3275
3276
3277 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3278 for val in map(d.get, variadic(key_or_keys)):
3279 if val is not None and (val or not skip_false_values):
3280 return val
3281 return default
3282
3283
3284 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3285 for f in funcs:
3286 try:
3287 val = f(*args, **kwargs)
3288 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
3289 pass
3290 else:
3291 if expected_type is None or isinstance(val, expected_type):
3292 return val
3293
3294
3295 def try_get(src, getter, expected_type=None):
3296 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3297
3298
3299 def filter_dict(dct, cndn=lambda _, v: v is not None):
3300 return {k: v for k, v in dct.items() if cndn(k, v)}
3301
3302
3303 def merge_dicts(*dicts):
3304 merged = {}
3305 for a_dict in dicts:
3306 for k, v in a_dict.items():
3307 if (v is not None and k not in merged
3308 or isinstance(v, str) and merged[k] == ''):
3309 merged[k] = v
3310 return merged
3311
3312
3313 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3314 return string if isinstance(string, str) else str(string, encoding, errors)
3315
3316
3317 US_RATINGS = {
3318 'G': 0,
3319 'PG': 10,
3320 'PG-13': 13,
3321 'R': 16,
3322 'NC': 18,
3323 }
3324
3325
3326 TV_PARENTAL_GUIDELINES = {
3327 'TV-Y': 0,
3328 'TV-Y7': 7,
3329 'TV-G': 0,
3330 'TV-PG': 0,
3331 'TV-14': 14,
3332 'TV-MA': 17,
3333 }
3334
3335
3336 def parse_age_limit(s):
3337 # isinstance(False, int) is True. So type() must be used instead
3338 if type(s) is int: # noqa: E721
3339 return s if 0 <= s <= 21 else None
3340 elif not isinstance(s, str):
3341 return None
3342 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3343 if m:
3344 return int(m.group('age'))
3345 s = s.upper()
3346 if s in US_RATINGS:
3347 return US_RATINGS[s]
3348 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3349 if m:
3350 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3351 return None
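# Illustrative mappings (based on the tables above):
#   parse_age_limit('PG-13') == 13
#   parse_age_limit('TV-MA') == 17
#   parse_age_limit('18+') == 18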
3352
3353
3354 def strip_jsonp(code):
3355 return re.sub(
3356 r'''(?sx)^
3357 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3358 (?:\s*&&\s*(?P=func_name))?
3359 \s*\(\s*(?P<callback_data>.*)\);?
3360 \s*?(?://[^\n]*)*$''',
3361 r'\g<callback_data>', code)
3362
3363
3364 def js_to_json(code, vars={}, *, strict=False):
3365 # vars is a dict of var, val pairs to substitute
3366 STRING_QUOTES = '\'"'
3367 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
3368 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3369 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3370 INTEGER_TABLE = (
3371 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3372 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3373 )
3374
3375 def process_escape(match):
3376 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3377 escape = match.group(1) or match.group(2)
3378
3379 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3380 else R'\u00' if escape == 'x'
3381 else '' if escape == '\n'
3382 else escape)
3383
3384 def fix_kv(m):
3385 v = m.group(0)
3386 if v in ('true', 'false', 'null'):
3387 return v
3388 elif v in ('undefined', 'void 0'):
3389 return 'null'
3390 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3391 return ''
3392
3393 if v[0] in STRING_QUOTES:
3394 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
3395 return f'"{escaped}"'
3396
3397 for regex, base in INTEGER_TABLE:
3398 im = re.match(regex, v)
3399 if im:
3400 i = int(im.group(1), base)
3401 return f'"{i}":' if v.endswith(':') else str(i)
3402
3403 if v in vars:
3404 try:
3405 if not strict:
3406 json.loads(vars[v])
3407 except json.JSONDecodeError:
3408 return json.dumps(vars[v])
3409 else:
3410 return vars[v]
3411
3412 if not strict:
3413 return f'"{v}"'
3414
3415 raise ValueError(f'Unknown value: {v}')
3416
3417 def create_map(mobj):
3418 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3419
3420 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3421 if not strict:
3422 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3423 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
3424 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
3425 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
3426
3427 return re.sub(rf'''(?sx)
3428 {STRING_RE}|
3429 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
3430 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3431 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3432 [0-9]+(?={SKIP_RE}:)|
3433 !+
3434 ''', fix_kv, code)
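# A minimal sketch of what gets normalized (unquoted keys, single quotes,
# trailing commas, hex literals):
#   js_to_json("{a: 1, 'b': \"c\",}") == '{"a": 1, "b": "c"}'
#   js_to_json('0x1F') == '31'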
3435
3436
3437 def qualities(quality_ids):
3438 """ Get a numeric quality value out of a list of possible values """
3439 def q(qid):
3440 try:
3441 return quality_ids.index(qid)
3442 except ValueError:
3443 return -1
3444 return q
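# e.g. q = qualities(['144p', '720p', '1080p']); q('1080p') == 2, q('4k') == -1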
3445
3446
3447 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3448
3449
3450 DEFAULT_OUTTMPL = {
3451 'default': '%(title)s [%(id)s].%(ext)s',
3452 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3453 }
3454 OUTTMPL_TYPES = {
3455 'chapter': None,
3456 'subtitle': None,
3457 'thumbnail': None,
3458 'description': 'description',
3459 'annotation': 'annotations.xml',
3460 'infojson': 'info.json',
3461 'link': None,
3462 'pl_video': None,
3463 'pl_thumbnail': None,
3464 'pl_description': 'description',
3465 'pl_infojson': 'info.json',
3466 }
3467
3468 # As of [1], the format syntax is:
3469 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3470 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3471 STR_FORMAT_RE_TMPL = r'''(?x)
3472 (?<!%)(?P<prefix>(?:%%)*)
3473 %
3474 (?P<has_key>\((?P<key>{0})\))?
3475 (?P<format>
3476 (?P<conversion>[#0\-+ ]+)?
3477 (?P<min_width>\d+)?
3478 (?P<precision>\.\d+)?
3479 (?P<len_mod>[hlL])? # unused in python
3480 {1} # conversion type
3481 )
3482 '''
3483
3484
3485 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3486
3487
3488 def limit_length(s, length):
3489 """ Add ellipses to overly long strings """
3490 if s is None:
3491 return None
3492 ELLIPSES = '...'
3493 if len(s) > length:
3494 return s[:length - len(ELLIPSES)] + ELLIPSES
3495 return s
3496
3497
3498 def version_tuple(v):
3499 return tuple(int(e) for e in re.split(r'[-.]', v))
3500
3501
3502 def is_outdated_version(version, limit, assume_new=True):
3503 if not version:
3504 return not assume_new
3505 try:
3506 return version_tuple(version) < version_tuple(limit)
3507 except ValueError:
3508 return not assume_new
3509
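# Editor's sketch of the comparison these helpers perform:
#     >>> version_tuple('2023.03.04')
#     (2023, 3, 4)
#     >>> is_outdated_version('2023.03.04', '2023.06.21')
#     True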
3510
3511 def ytdl_is_updateable():
3512 """ Returns if yt-dlp can be updated with -U """
3513
3514 from .update import is_non_updateable
3515
3516 return not is_non_updateable()
3517
3518
3519 def args_to_str(args):
3520 # Get a short string representation for a subprocess command
3521 return ' '.join(compat_shlex_quote(a) for a in args)
3522
3523
3524 def error_to_compat_str(err):
3525 return str(err)
3526
3527
3528 def error_to_str(err):
3529 return f'{type(err).__name__}: {err}'
3530
3531
3532 def mimetype2ext(mt, default=NO_DEFAULT):
3533 if not isinstance(mt, str):
3534 if default is not NO_DEFAULT:
3535 return default
3536 return None
3537
3538 MAP = {
3539 # video
3540 '3gpp': '3gp',
3541 'mp2t': 'ts',
3542 'mp4': 'mp4',
3543 'mpeg': 'mpeg',
3544 'mpegurl': 'm3u8',
3545 'quicktime': 'mov',
3546 'webm': 'webm',
3547 'vp9': 'vp9',
3548 'x-flv': 'flv',
3549 'x-m4v': 'm4v',
3550 'x-matroska': 'mkv',
3551 'x-mng': 'mng',
3552 'x-mp4-fragmented': 'mp4',
3553 'x-ms-asf': 'asf',
3554 'x-ms-wmv': 'wmv',
3555 'x-msvideo': 'avi',
3556
3557 # application (streaming playlists)
3558 'dash+xml': 'mpd',
3559 'f4m+xml': 'f4m',
3560 'hds+xml': 'f4m',
3561 'vnd.apple.mpegurl': 'm3u8',
3562 'vnd.ms-sstr+xml': 'ism',
3563 'x-mpegurl': 'm3u8',
3564
3565 # audio
3566 'audio/mp4': 'm4a',
3567 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3568 # Using .mp3 as it's the most popular one
3569 'audio/mpeg': 'mp3',
3570 'audio/webm': 'webm',
3571 'audio/x-matroska': 'mka',
3572 'audio/x-mpegurl': 'm3u',
3573 'midi': 'mid',
3574 'ogg': 'ogg',
3575 'wav': 'wav',
3576 'wave': 'wav',
3577 'x-aac': 'aac',
3578 'x-flac': 'flac',
3579 'x-m4a': 'm4a',
3580 'x-realaudio': 'ra',
3581 'x-wav': 'wav',
3582
3583 # image
3584 'avif': 'avif',
3585 'bmp': 'bmp',
3586 'gif': 'gif',
3587 'jpeg': 'jpg',
3588 'png': 'png',
3589 'svg+xml': 'svg',
3590 'tiff': 'tif',
3591 'vnd.wap.wbmp': 'wbmp',
3592 'webp': 'webp',
3593 'x-icon': 'ico',
3594 'x-jng': 'jng',
3595 'x-ms-bmp': 'bmp',
3596
3597 # caption
3598 'filmstrip+json': 'fs',
3599 'smptett+xml': 'tt',
3600 'ttaf+xml': 'dfxp',
3601 'ttml+xml': 'ttml',
3602 'x-ms-sami': 'sami',
3603
3604 # misc
3605 'gzip': 'gz',
3606 'json': 'json',
3607 'xml': 'xml',
3608 'zip': 'zip',
3609 }
3610
3611 mimetype = mt.partition(';')[0].strip().lower()
3612 _, _, subtype = mimetype.rpartition('/')
3613
3614 ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
3615 if ext:
3616 return ext
3617 elif default is not NO_DEFAULT:
3618 return default
3619 return subtype.replace('+', '.')
3620
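# Editor's sketch of the lookup order: full mimetype, then subtype, then the
# part after '+', finally falling back to the subtype itself:
#     >>> mimetype2ext('video/mp4; codecs="avc1.42E01E"')
#     'mp4'
#     >>> mimetype2ext('application/vnd.apple.mpegurl')
#     'm3u8'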
3621
3622 def ext2mimetype(ext_or_url):
3623 if not ext_or_url:
3624 return None
3625 if '.' not in ext_or_url:
3626 ext_or_url = f'file.{ext_or_url}'
3627 return mimetypes.guess_type(ext_or_url)[0]
3628
3629
3630 def parse_codecs(codecs_str):
3631 # http://tools.ietf.org/html/rfc6381
3632 if not codecs_str:
3633 return {}
3634 split_codecs = list(filter(None, map(
3635 str.strip, codecs_str.strip().strip(',').split(','))))
3636 vcodec, acodec, scodec, hdr = None, None, None, None
3637 for full_codec in split_codecs:
3638 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3639 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3640 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3641 if vcodec:
3642 continue
3643 vcodec = full_codec
3644 if parts[0] in ('dvh1', 'dvhe'):
3645 hdr = 'DV'
3646 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3647 hdr = 'HDR10'
3648 elif parts[:2] == ['vp9', '2']:
3649 hdr = 'HDR10'
3650 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
3651 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3652 acodec = acodec or full_codec
3653 elif parts[0] in ('stpp', 'wvtt'):
3654 scodec = scodec or full_codec
3655 else:
3656 write_string(f'WARNING: Unknown codec {full_codec}\n')
3657 if vcodec or acodec or scodec:
3658 return {
3659 'vcodec': vcodec or 'none',
3660 'acodec': acodec or 'none',
3661 'dynamic_range': hdr,
3662 **({'scodec': scodec} if scodec is not None else {}),
3663 }
3664 elif len(split_codecs) == 2:
3665 return {
3666 'vcodec': split_codecs[0],
3667 'acodec': split_codecs[1],
3668 }
3669 return {}
3670
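# Editor's illustration (assumed codec string) of the split performed above:
#     >>> parse_codecs('avc1.64001f,mp4a.40.2')
#     {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}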
3671
3672 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3673 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3674
3675 allow_mkv = not preferences or 'mkv' in preferences
3676
3677 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3678 return 'mkv' # TODO: any other format allows this?
3679
3680 # TODO: Not all codecs supported by parse_codecs are handled here
3681 COMPATIBLE_CODECS = {
3682 'mp4': {
3683 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
3684 'h264', 'aacl', 'ec-3', # Set in ISM
3685 },
3686 'webm': {
3687 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3688 'vp9x', 'vp8x', # in the webm spec
3689 },
3690 }
3691
3692 sanitize_codec = functools.partial(
3693 try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
3694 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3695
3696 for ext in preferences or COMPATIBLE_CODECS.keys():
3697 codec_set = COMPATIBLE_CODECS.get(ext, set())
3698 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3699 return ext
3700
3701 COMPATIBLE_EXTS = (
3702 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3703 {'webm', 'weba'},
3704 )
3705 for ext in preferences or vexts:
3706 current_exts = {ext, *vexts, *aexts}
3707 if ext == 'mkv' or current_exts == {ext} or any(
3708 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3709 return ext
3710 return 'mkv' if allow_mkv else preferences[-1]
3711
3712
3713 def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
3714 getheader = url_handle.headers.get
3715
3716 cd = getheader('Content-Disposition')
3717 if cd:
3718 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3719 if m:
3720 e = determine_ext(m.group('filename'), default_ext=None)
3721 if e:
3722 return e
3723
3724 meta_ext = getheader('x-amz-meta-name')
3725 if meta_ext:
3726 e = meta_ext.rpartition('.')[2]
3727 if e:
3728 return e
3729
3730 return mimetype2ext(getheader('Content-Type'), default=default)
3731
3732
3733 def encode_data_uri(data, mime_type):
3734 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3735
3736
3737 def age_restricted(content_limit, age_limit):
3738 """ Returns True iff the content should be blocked """
3739
3740 if age_limit is None: # No limit set
3741 return False
3742 if content_limit is None:
3743 return False # Content available for everyone
3744 return age_limit < content_limit
3745
3746
3747 # List of known byte-order-marks (BOM)
3748 BOMS = [
3749 (b'\xef\xbb\xbf', 'utf-8'),
3750 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3751 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3752 (b'\xff\xfe', 'utf-16-le'),
3753 (b'\xfe\xff', 'utf-16-be'),
3754 ]
3755
3756
3757 def is_html(first_bytes):
3758 """ Detect whether a file contains HTML by examining its first bytes. """
3759
3760 encoding = 'utf-8'
3761 for bom, enc in BOMS:
3762 while first_bytes.startswith(bom):
3763 encoding, first_bytes = enc, first_bytes[len(bom):]
3764
3765 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3766
3767
3768 def determine_protocol(info_dict):
3769 protocol = info_dict.get('protocol')
3770 if protocol is not None:
3771 return protocol
3772
3773 url = sanitize_url(info_dict['url'])
3774 if url.startswith('rtmp'):
3775 return 'rtmp'
3776 elif url.startswith('mms'):
3777 return 'mms'
3778 elif url.startswith('rtsp'):
3779 return 'rtsp'
3780
3781 ext = determine_ext(url)
3782 if ext == 'm3u8':
3783 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3784 elif ext == 'f4m':
3785 return 'f4m'
3786
3787 return urllib.parse.urlparse(url).scheme
3788
3789
3790 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3791 """ Render a list of rows, each as a list of values.
3792 Text after a \t will be right aligned """
3793 def width(string):
3794 return len(remove_terminal_sequences(string).replace('\t', ''))
3795
3796 def get_max_lens(table):
3797 return [max(width(str(v)) for v in col) for col in zip(*table)]
3798
3799 def filter_using_list(row, filterArray):
3800 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3801
3802 max_lens = get_max_lens(data) if hide_empty else []
3803 header_row = filter_using_list(header_row, max_lens)
3804 data = [filter_using_list(row, max_lens) for row in data]
3805
3806 table = [header_row] + data
3807 max_lens = get_max_lens(table)
3808 extra_gap += 1
3809 if delim:
3810 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3811 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3812 for row in table:
3813 for pos, text in enumerate(map(str, row)):
3814 if '\t' in text:
3815 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3816 else:
3817 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3818 ret = '\n'.join(''.join(row).rstrip() for row in table)
3819 return ret
3820
3821
3822 def _match_one(filter_part, dct, incomplete):
3823 # TODO: Generalize code with YoutubeDL._build_format_filter
3824 STRING_OPERATORS = {
3825 '*=': operator.contains,
3826 '^=': lambda attr, value: attr.startswith(value),
3827 '$=': lambda attr, value: attr.endswith(value),
3828 '~=': lambda attr, value: re.search(value, attr),
3829 }
3830 COMPARISON_OPERATORS = {
3831 **STRING_OPERATORS,
3832 '<=': operator.le, # "<=" must be defined above "<"
3833 '<': operator.lt,
3834 '>=': operator.ge,
3835 '>': operator.gt,
3836 '=': operator.eq,
3837 }
3838
3839 if isinstance(incomplete, bool):
3840 is_incomplete = lambda _: incomplete
3841 else:
3842 is_incomplete = lambda k: k in incomplete
3843
3844 operator_rex = re.compile(r'''(?x)
3845 (?P<key>[a-z_]+)
3846 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3847 (?:
3848 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3849 (?P<strval>.+?)
3850 )
3851 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3852 m = operator_rex.fullmatch(filter_part.strip())
3853 if m:
3854 m = m.groupdict()
3855 unnegated_op = COMPARISON_OPERATORS[m['op']]
3856 if m['negation']:
3857 op = lambda attr, value: not unnegated_op(attr, value)
3858 else:
3859 op = unnegated_op
3860 comparison_value = m['quotedstrval'] or m['strval']  # the regex above defines no 'intval' group
3861 if m['quote']:
3862 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3863 actual_value = dct.get(m['key'])
3864 numeric_comparison = None
3865 if isinstance(actual_value, (int, float)):
3866 # If the original field is a string and the matching comparison value is
3867 # a number, we should respect the origin of the original field
3868 # and process the comparison value as a string (see
3869 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3870 try:
3871 numeric_comparison = int(comparison_value)
3872 except ValueError:
3873 numeric_comparison = parse_filesize(comparison_value)
3874 if numeric_comparison is None:
3875 numeric_comparison = parse_filesize(f'{comparison_value}B')
3876 if numeric_comparison is None:
3877 numeric_comparison = parse_duration(comparison_value)
3878 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3879 raise ValueError('Operator %s only supports string values!' % m['op'])
3880 if actual_value is None:
3881 return is_incomplete(m['key']) or m['none_inclusive']
3882 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3883
3884 UNARY_OPERATORS = {
3885 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3886 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3887 }
3888 operator_rex = re.compile(r'''(?x)
3889 (?P<op>%s)\s*(?P<key>[a-z_]+)
3890 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3891 m = operator_rex.fullmatch(filter_part.strip())
3892 if m:
3893 op = UNARY_OPERATORS[m.group('op')]
3894 actual_value = dct.get(m.group('key'))
3895 if is_incomplete(m.group('key')) and actual_value is None:
3896 return True
3897 return op(actual_value)
3898
3899 raise ValueError('Invalid filter part %r' % filter_part)
3900
3901
3902 def match_str(filter_str, dct, incomplete=False):
3903 """ Filter a dictionary with a simple string syntax.
3904 @returns Whether the filter passes
3905 @param incomplete Set of keys that are expected to be missing from dct.
3906 Can be True/False to indicate all/none of the keys may be missing.
3907 All conditions on incomplete keys pass if the key is missing
3908 """
3909 return all(
3910 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3911 for filter_part in re.split(r'(?<!\\)&', filter_str))
3912
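# Editor's sketch of the filter syntax (field names are hypothetical):
#     >>> match_str('duration > 60 & like_count >? 100', {'duration': 120})
#     True
# like_count is missing, but the '?' suffix lets the condition pass on None.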
3913
3914 def match_filter_func(filters, breaking_filters=None):
3915 if not filters and not breaking_filters:
3916 return None
3917 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3918 filters = set(variadic(filters or []))
3919
3920 interactive = '-' in filters
3921 if interactive:
3922 filters.remove('-')
3923
3924 def _match_func(info_dict, incomplete=False):
3925 ret = breaking_filters(info_dict, incomplete)
3926 if ret is not None:
3927 raise RejectedVideoReached(ret)
3928
3929 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3930 return NO_DEFAULT if interactive and not incomplete else None
3931 else:
3932 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3933 filter_str = ') | ('.join(map(str.strip, filters))
3934 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3935 return _match_func
3936
3937
3938 class download_range_func:
3939 def __init__(self, chapters, ranges):
3940 self.chapters, self.ranges = chapters, ranges
3941
3942 def __call__(self, info_dict, ydl):
3943 if not self.ranges and not self.chapters:
3944 yield {}
3945
3946 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3947 else 'Cannot match chapters since chapter information is unavailable')
3948 for regex in self.chapters or []:
3949 for i, chapter in enumerate(info_dict.get('chapters') or []):
3950 if re.search(regex, chapter['title']):
3951 warning = None
3952 yield {**chapter, 'index': i}
3953 if self.chapters and warning:
3954 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3955
3956 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3957
3958 def __eq__(self, other):
3959 return (isinstance(other, download_range_func)
3960 and self.chapters == other.chapters and self.ranges == other.ranges)
3961
3962 def __repr__(self):
3963 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
3964
3965
3966 def parse_dfxp_time_expr(time_expr):
3967 if not time_expr:
3968 return
3969
3970 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3971 if mobj:
3972 return float(mobj.group('time_offset'))
3973
3974 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3975 if mobj:
3976 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3977
3978
3979 def srt_subtitles_timecode(seconds):
3980 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3981
3982
3983 def ass_subtitles_timecode(seconds):
3984 time = timetuple_from_msec(seconds * 1000)
3985 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3986
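# Editor's example of the two timecode formats:
#     >>> srt_subtitles_timecode(3661.5)
#     '01:01:01,500'
#     >>> ass_subtitles_timecode(3661.5)
#     '1:01:01.50'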
3987
3988 def dfxp2srt(dfxp_data):
3989 '''
3990 @param dfxp_data A bytes-like object containing DFXP data
3991 @returns A unicode object containing converted SRT data
3992 '''
3993 LEGACY_NAMESPACES = (
3994 (b'http://www.w3.org/ns/ttml', [
3995 b'http://www.w3.org/2004/11/ttaf1',
3996 b'http://www.w3.org/2006/04/ttaf1',
3997 b'http://www.w3.org/2006/10/ttaf1',
3998 ]),
3999 (b'http://www.w3.org/ns/ttml#styling', [
4000 b'http://www.w3.org/ns/ttml#style',
4001 ]),
4002 )
4003
4004 SUPPORTED_STYLING = [
4005 'color',
4006 'fontFamily',
4007 'fontSize',
4008 'fontStyle',
4009 'fontWeight',
4010 'textDecoration'
4011 ]
4012
4013 _x = functools.partial(xpath_with_ns, ns_map={
4014 'xml': 'http://www.w3.org/XML/1998/namespace',
4015 'ttml': 'http://www.w3.org/ns/ttml',
4016 'tts': 'http://www.w3.org/ns/ttml#styling',
4017 })
4018
4019 styles = {}
4020 default_style = {}
4021
4022 class TTMLPElementParser:
4023 def __init__(self):
4024 # Instance attributes: mutable class-level defaults would be shared between parser instances
4025 self._out, self._unclosed_elements, self._applied_styles = '', [], []
4026
4027 def start(self, tag, attrib):
4028 if tag in (_x('ttml:br'), 'br'):
4029 self._out += '\n'
4030 else:
4031 unclosed_elements = []
4032 style = {}
4033 element_style_id = attrib.get('style')
4034 if default_style:
4035 style.update(default_style)
4036 if element_style_id:
4037 style.update(styles.get(element_style_id, {}))
4038 for prop in SUPPORTED_STYLING:
4039 prop_val = attrib.get(_x('tts:' + prop))
4040 if prop_val:
4041 style[prop] = prop_val
4042 if style:
4043 font = ''
4044 for k, v in sorted(style.items()):
4045 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4046 continue
4047 if k == 'color':
4048 font += ' color="%s"' % v
4049 elif k == 'fontSize':
4050 font += ' size="%s"' % v
4051 elif k == 'fontFamily':
4052 font += ' face="%s"' % v
4053 elif k == 'fontWeight' and v == 'bold':
4054 self._out += '<b>'
4055 unclosed_elements.append('b')
4056 elif k == 'fontStyle' and v == 'italic':
4057 self._out += '<i>'
4058 unclosed_elements.append('i')
4059 elif k == 'textDecoration' and v == 'underline':
4060 self._out += '<u>'
4061 unclosed_elements.append('u')
4062 if font:
4063 self._out += '<font' + font + '>'
4064 unclosed_elements.append('font')
4065 applied_style = {}
4066 if self._applied_styles:
4067 applied_style.update(self._applied_styles[-1])
4068 applied_style.update(style)
4069 self._applied_styles.append(applied_style)
4070 self._unclosed_elements.append(unclosed_elements)
4071
4072 def end(self, tag):
4073 if tag not in (_x('ttml:br'), 'br'):
4074 unclosed_elements = self._unclosed_elements.pop()
4075 for element in reversed(unclosed_elements):
4076 self._out += '</%s>' % element
4077 if unclosed_elements and self._applied_styles:
4078 self._applied_styles.pop()
4079
4080 def data(self, data):
4081 self._out += data
4082
4083 def close(self):
4084 return self._out.strip()
4085
4086 def parse_node(node):
4087 target = TTMLPElementParser()
4088 parser = xml.etree.ElementTree.XMLParser(target=target)
4089 parser.feed(xml.etree.ElementTree.tostring(node))
4090 return parser.close()
4091
4092 for k, v in LEGACY_NAMESPACES:
4093 for ns in v:
4094 dfxp_data = dfxp_data.replace(ns, k)
4095
4096 dfxp = compat_etree_fromstring(dfxp_data)
4097 out = []
4098 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4099
4100 if not paras:
4101 raise ValueError('Invalid dfxp/TTML subtitle')
4102
4103 repeat = False
4104 while True:
4105 for style in dfxp.findall(_x('.//ttml:style')):
4106 style_id = style.get('id') or style.get(_x('xml:id'))
4107 if not style_id:
4108 continue
4109 parent_style_id = style.get('style')
4110 if parent_style_id:
4111 if parent_style_id not in styles:
4112 repeat = True
4113 continue
4114 styles[style_id] = styles[parent_style_id].copy()
4115 for prop in SUPPORTED_STYLING:
4116 prop_val = style.get(_x('tts:' + prop))
4117 if prop_val:
4118 styles.setdefault(style_id, {})[prop] = prop_val
4119 if repeat:
4120 repeat = False
4121 else:
4122 break
4123
4124 for p in ('body', 'div'):
4125 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4126 if ele is None:
4127 continue
4128 style = styles.get(ele.get('style'))
4129 if not style:
4130 continue
4131 default_style.update(style)
4132
4133 for para, index in zip(paras, itertools.count(1)):
4134 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4135 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4136 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4137 if begin_time is None:
4138 continue
4139 if not end_time:
4140 if not dur:
4141 continue
4142 end_time = begin_time + dur
4143 out.append('%d\n%s --> %s\n%s\n\n' % (
4144 index,
4145 srt_subtitles_timecode(begin_time),
4146 srt_subtitles_timecode(end_time),
4147 parse_node(para)))
4148
4149 return ''.join(out)
4150
4151
4152 def cli_option(params, command_option, param, separator=None):
4153 param = params.get(param)
4154 return ([] if param is None
4155 else [command_option, str(param)] if separator is None
4156 else [f'{command_option}{separator}{param}'])
4157
4158
4159 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4160 param = params.get(param)
4161 assert param in (True, False, None)
4162 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
4163
4164
4165 def cli_valueless_option(params, command_option, param, expected_value=True):
4166 return [command_option] if params.get(param) == expected_value else []
4167
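# Editor's usage sketch for the cli_* helpers (option names are made up):
#     >>> cli_option({'proxy': 'socks5://127.0.0.1'}, '--proxy', 'proxy')
#     ['--proxy', 'socks5://127.0.0.1']
#     >>> cli_bool_option({'check': True}, '--ssl', 'check', 'never', 'always', '=')
#     ['--ssl=never']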
4168
4169 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4170 if isinstance(argdict, (list, tuple)): # for backward compatibility
4171 if use_compat:
4172 return argdict
4173 else:
4174 argdict = None
4175 if argdict is None:
4176 return default
4177 assert isinstance(argdict, dict)
4178
4179 assert isinstance(keys, (list, tuple))
4180 for key_list in keys:
4181 arg_list = list(filter(
4182 lambda x: x is not None,
4183 [argdict.get(key.lower()) for key in variadic(key_list)]))
4184 if arg_list:
4185 return [arg for args in arg_list for arg in args]
4186 return default
4187
4188
4189 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4190 main_key, exe = main_key.lower(), exe.lower()
4191 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4192 keys = [f'{root_key}{k}' for k in (keys or [''])]
4193 if root_key in keys:
4194 if main_key != exe:
4195 keys.append((main_key, exe))
4196 keys.append('default')
4197 else:
4198 use_compat = False
4199 return cli_configuration_args(argdict, keys, default, use_compat)
4200
4201
4202 class ISO639Utils:
4203 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4204 _lang_map = {
4205 'aa': 'aar',
4206 'ab': 'abk',
4207 'ae': 'ave',
4208 'af': 'afr',
4209 'ak': 'aka',
4210 'am': 'amh',
4211 'an': 'arg',
4212 'ar': 'ara',
4213 'as': 'asm',
4214 'av': 'ava',
4215 'ay': 'aym',
4216 'az': 'aze',
4217 'ba': 'bak',
4218 'be': 'bel',
4219 'bg': 'bul',
4220 'bh': 'bih',
4221 'bi': 'bis',
4222 'bm': 'bam',
4223 'bn': 'ben',
4224 'bo': 'bod',
4225 'br': 'bre',
4226 'bs': 'bos',
4227 'ca': 'cat',
4228 'ce': 'che',
4229 'ch': 'cha',
4230 'co': 'cos',
4231 'cr': 'cre',
4232 'cs': 'ces',
4233 'cu': 'chu',
4234 'cv': 'chv',
4235 'cy': 'cym',
4236 'da': 'dan',
4237 'de': 'deu',
4238 'dv': 'div',
4239 'dz': 'dzo',
4240 'ee': 'ewe',
4241 'el': 'ell',
4242 'en': 'eng',
4243 'eo': 'epo',
4244 'es': 'spa',
4245 'et': 'est',
4246 'eu': 'eus',
4247 'fa': 'fas',
4248 'ff': 'ful',
4249 'fi': 'fin',
4250 'fj': 'fij',
4251 'fo': 'fao',
4252 'fr': 'fra',
4253 'fy': 'fry',
4254 'ga': 'gle',
4255 'gd': 'gla',
4256 'gl': 'glg',
4257 'gn': 'grn',
4258 'gu': 'guj',
4259 'gv': 'glv',
4260 'ha': 'hau',
4261 'he': 'heb',
4262 'iw': 'heb', # Replaced by he in 1989 revision
4263 'hi': 'hin',
4264 'ho': 'hmo',
4265 'hr': 'hrv',
4266 'ht': 'hat',
4267 'hu': 'hun',
4268 'hy': 'hye',
4269 'hz': 'her',
4270 'ia': 'ina',
4271 'id': 'ind',
4272 'in': 'ind', # Replaced by id in 1989 revision
4273 'ie': 'ile',
4274 'ig': 'ibo',
4275 'ii': 'iii',
4276 'ik': 'ipk',
4277 'io': 'ido',
4278 'is': 'isl',
4279 'it': 'ita',
4280 'iu': 'iku',
4281 'ja': 'jpn',
4282 'jv': 'jav',
4283 'ka': 'kat',
4284 'kg': 'kon',
4285 'ki': 'kik',
4286 'kj': 'kua',
4287 'kk': 'kaz',
4288 'kl': 'kal',
4289 'km': 'khm',
4290 'kn': 'kan',
4291 'ko': 'kor',
4292 'kr': 'kau',
4293 'ks': 'kas',
4294 'ku': 'kur',
4295 'kv': 'kom',
4296 'kw': 'cor',
4297 'ky': 'kir',
4298 'la': 'lat',
4299 'lb': 'ltz',
4300 'lg': 'lug',
4301 'li': 'lim',
4302 'ln': 'lin',
4303 'lo': 'lao',
4304 'lt': 'lit',
4305 'lu': 'lub',
4306 'lv': 'lav',
4307 'mg': 'mlg',
4308 'mh': 'mah',
4309 'mi': 'mri',
4310 'mk': 'mkd',
4311 'ml': 'mal',
4312 'mn': 'mon',
4313 'mr': 'mar',
4314 'ms': 'msa',
4315 'mt': 'mlt',
4316 'my': 'mya',
4317 'na': 'nau',
4318 'nb': 'nob',
4319 'nd': 'nde',
4320 'ne': 'nep',
4321 'ng': 'ndo',
4322 'nl': 'nld',
4323 'nn': 'nno',
4324 'no': 'nor',
4325 'nr': 'nbl',
4326 'nv': 'nav',
4327 'ny': 'nya',
4328 'oc': 'oci',
4329 'oj': 'oji',
4330 'om': 'orm',
4331 'or': 'ori',
4332 'os': 'oss',
4333 'pa': 'pan',
4334 'pi': 'pli',
4335 'pl': 'pol',
4336 'ps': 'pus',
4337 'pt': 'por',
4338 'qu': 'que',
4339 'rm': 'roh',
4340 'rn': 'run',
4341 'ro': 'ron',
4342 'ru': 'rus',
4343 'rw': 'kin',
4344 'sa': 'san',
4345 'sc': 'srd',
4346 'sd': 'snd',
4347 'se': 'sme',
4348 'sg': 'sag',
4349 'si': 'sin',
4350 'sk': 'slk',
4351 'sl': 'slv',
4352 'sm': 'smo',
4353 'sn': 'sna',
4354 'so': 'som',
4355 'sq': 'sqi',
4356 'sr': 'srp',
4357 'ss': 'ssw',
4358 'st': 'sot',
4359 'su': 'sun',
4360 'sv': 'swe',
4361 'sw': 'swa',
4362 'ta': 'tam',
4363 'te': 'tel',
4364 'tg': 'tgk',
4365 'th': 'tha',
4366 'ti': 'tir',
4367 'tk': 'tuk',
4368 'tl': 'tgl',
4369 'tn': 'tsn',
4370 'to': 'ton',
4371 'tr': 'tur',
4372 'ts': 'tso',
4373 'tt': 'tat',
4374 'tw': 'twi',
4375 'ty': 'tah',
4376 'ug': 'uig',
4377 'uk': 'ukr',
4378 'ur': 'urd',
4379 'uz': 'uzb',
4380 've': 'ven',
4381 'vi': 'vie',
4382 'vo': 'vol',
4383 'wa': 'wln',
4384 'wo': 'wol',
4385 'xh': 'xho',
4386 'yi': 'yid',
4387 'ji': 'yid', # Replaced by yi in 1989 revision
4388 'yo': 'yor',
4389 'za': 'zha',
4390 'zh': 'zho',
4391 'zu': 'zul',
4392 }
4393
4394 @classmethod
4395 def short2long(cls, code):
4396 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4397 return cls._lang_map.get(code[:2])
4398
4399 @classmethod
4400 def long2short(cls, code):
4401 """Convert language code from ISO 639-2/T to ISO 639-1"""
4402 for short_name, long_name in cls._lang_map.items():
4403 if long_name == code:
4404 return short_name
4405
4406
4407 class ISO3166Utils:
4408 # From http://data.okfn.org/data/core/country-list
4409 _country_map = {
4410 'AF': 'Afghanistan',
4411 'AX': 'Åland Islands',
4412 'AL': 'Albania',
4413 'DZ': 'Algeria',
4414 'AS': 'American Samoa',
4415 'AD': 'Andorra',
4416 'AO': 'Angola',
4417 'AI': 'Anguilla',
4418 'AQ': 'Antarctica',
4419 'AG': 'Antigua and Barbuda',
4420 'AR': 'Argentina',
4421 'AM': 'Armenia',
4422 'AW': 'Aruba',
4423 'AU': 'Australia',
4424 'AT': 'Austria',
4425 'AZ': 'Azerbaijan',
4426 'BS': 'Bahamas',
4427 'BH': 'Bahrain',
4428 'BD': 'Bangladesh',
4429 'BB': 'Barbados',
4430 'BY': 'Belarus',
4431 'BE': 'Belgium',
4432 'BZ': 'Belize',
4433 'BJ': 'Benin',
4434 'BM': 'Bermuda',
4435 'BT': 'Bhutan',
4436 'BO': 'Bolivia, Plurinational State of',
4437 'BQ': 'Bonaire, Sint Eustatius and Saba',
4438 'BA': 'Bosnia and Herzegovina',
4439 'BW': 'Botswana',
4440 'BV': 'Bouvet Island',
4441 'BR': 'Brazil',
4442 'IO': 'British Indian Ocean Territory',
4443 'BN': 'Brunei Darussalam',
4444 'BG': 'Bulgaria',
4445 'BF': 'Burkina Faso',
4446 'BI': 'Burundi',
4447 'KH': 'Cambodia',
4448 'CM': 'Cameroon',
4449 'CA': 'Canada',
4450 'CV': 'Cape Verde',
4451 'KY': 'Cayman Islands',
4452 'CF': 'Central African Republic',
4453 'TD': 'Chad',
4454 'CL': 'Chile',
4455 'CN': 'China',
4456 'CX': 'Christmas Island',
4457 'CC': 'Cocos (Keeling) Islands',
4458 'CO': 'Colombia',
4459 'KM': 'Comoros',
4460 'CG': 'Congo',
4461 'CD': 'Congo, the Democratic Republic of the',
4462 'CK': 'Cook Islands',
4463 'CR': 'Costa Rica',
4464 'CI': 'Côte d\'Ivoire',
4465 'HR': 'Croatia',
4466 'CU': 'Cuba',
4467 'CW': 'Curaçao',
4468 'CY': 'Cyprus',
4469 'CZ': 'Czech Republic',
4470 'DK': 'Denmark',
4471 'DJ': 'Djibouti',
4472 'DM': 'Dominica',
4473 'DO': 'Dominican Republic',
4474 'EC': 'Ecuador',
4475 'EG': 'Egypt',
4476 'SV': 'El Salvador',
4477 'GQ': 'Equatorial Guinea',
4478 'ER': 'Eritrea',
4479 'EE': 'Estonia',
4480 'ET': 'Ethiopia',
4481 'FK': 'Falkland Islands (Malvinas)',
4482 'FO': 'Faroe Islands',
4483 'FJ': 'Fiji',
4484 'FI': 'Finland',
4485 'FR': 'France',
4486 'GF': 'French Guiana',
4487 'PF': 'French Polynesia',
4488 'TF': 'French Southern Territories',
4489 'GA': 'Gabon',
4490 'GM': 'Gambia',
4491 'GE': 'Georgia',
4492 'DE': 'Germany',
4493 'GH': 'Ghana',
4494 'GI': 'Gibraltar',
4495 'GR': 'Greece',
4496 'GL': 'Greenland',
4497 'GD': 'Grenada',
4498 'GP': 'Guadeloupe',
4499 'GU': 'Guam',
4500 'GT': 'Guatemala',
4501 'GG': 'Guernsey',
4502 'GN': 'Guinea',
4503 'GW': 'Guinea-Bissau',
4504 'GY': 'Guyana',
4505 'HT': 'Haiti',
4506 'HM': 'Heard Island and McDonald Islands',
4507 'VA': 'Holy See (Vatican City State)',
4508 'HN': 'Honduras',
4509 'HK': 'Hong Kong',
4510 'HU': 'Hungary',
4511 'IS': 'Iceland',
4512 'IN': 'India',
4513 'ID': 'Indonesia',
4514 'IR': 'Iran, Islamic Republic of',
4515 'IQ': 'Iraq',
4516 'IE': 'Ireland',
4517 'IM': 'Isle of Man',
4518 'IL': 'Israel',
4519 'IT': 'Italy',
4520 'JM': 'Jamaica',
4521 'JP': 'Japan',
4522 'JE': 'Jersey',
4523 'JO': 'Jordan',
4524 'KZ': 'Kazakhstan',
4525 'KE': 'Kenya',
4526 'KI': 'Kiribati',
4527 'KP': 'Korea, Democratic People\'s Republic of',
4528 'KR': 'Korea, Republic of',
4529 'KW': 'Kuwait',
4530 'KG': 'Kyrgyzstan',
4531 'LA': 'Lao People\'s Democratic Republic',
4532 'LV': 'Latvia',
4533 'LB': 'Lebanon',
4534 'LS': 'Lesotho',
4535 'LR': 'Liberia',
4536 'LY': 'Libya',
4537 'LI': 'Liechtenstein',
4538 'LT': 'Lithuania',
4539 'LU': 'Luxembourg',
4540 'MO': 'Macao',
4541 'MK': 'Macedonia, the Former Yugoslav Republic of',
4542 'MG': 'Madagascar',
4543 'MW': 'Malawi',
4544 'MY': 'Malaysia',
4545 'MV': 'Maldives',
4546 'ML': 'Mali',
4547 'MT': 'Malta',
4548 'MH': 'Marshall Islands',
4549 'MQ': 'Martinique',
4550 'MR': 'Mauritania',
4551 'MU': 'Mauritius',
4552 'YT': 'Mayotte',
4553 'MX': 'Mexico',
4554 'FM': 'Micronesia, Federated States of',
4555 'MD': 'Moldova, Republic of',
4556 'MC': 'Monaco',
4557 'MN': 'Mongolia',
4558 'ME': 'Montenegro',
4559 'MS': 'Montserrat',
4560 'MA': 'Morocco',
4561 'MZ': 'Mozambique',
4562 'MM': 'Myanmar',
4563 'NA': 'Namibia',
4564 'NR': 'Nauru',
4565 'NP': 'Nepal',
4566 'NL': 'Netherlands',
4567 'NC': 'New Caledonia',
4568 'NZ': 'New Zealand',
4569 'NI': 'Nicaragua',
4570 'NE': 'Niger',
4571 'NG': 'Nigeria',
4572 'NU': 'Niue',
4573 'NF': 'Norfolk Island',
4574 'MP': 'Northern Mariana Islands',
4575 'NO': 'Norway',
4576 'OM': 'Oman',
4577 'PK': 'Pakistan',
4578 'PW': 'Palau',
4579 'PS': 'Palestine, State of',
4580 'PA': 'Panama',
4581 'PG': 'Papua New Guinea',
4582 'PY': 'Paraguay',
4583 'PE': 'Peru',
4584 'PH': 'Philippines',
4585 'PN': 'Pitcairn',
4586 'PL': 'Poland',
4587 'PT': 'Portugal',
4588 'PR': 'Puerto Rico',
4589 'QA': 'Qatar',
4590 'RE': 'Réunion',
4591 'RO': 'Romania',
4592 'RU': 'Russian Federation',
4593 'RW': 'Rwanda',
4594 'BL': 'Saint Barthélemy',
4595 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4596 'KN': 'Saint Kitts and Nevis',
4597 'LC': 'Saint Lucia',
4598 'MF': 'Saint Martin (French part)',
4599 'PM': 'Saint Pierre and Miquelon',
4600 'VC': 'Saint Vincent and the Grenadines',
4601 'WS': 'Samoa',
4602 'SM': 'San Marino',
4603 'ST': 'Sao Tome and Principe',
4604 'SA': 'Saudi Arabia',
4605 'SN': 'Senegal',
4606 'RS': 'Serbia',
4607 'SC': 'Seychelles',
4608 'SL': 'Sierra Leone',
4609 'SG': 'Singapore',
4610 'SX': 'Sint Maarten (Dutch part)',
4611 'SK': 'Slovakia',
4612 'SI': 'Slovenia',
4613 'SB': 'Solomon Islands',
4614 'SO': 'Somalia',
4615 'ZA': 'South Africa',
4616 'GS': 'South Georgia and the South Sandwich Islands',
4617 'SS': 'South Sudan',
4618 'ES': 'Spain',
4619 'LK': 'Sri Lanka',
4620 'SD': 'Sudan',
4621 'SR': 'Suriname',
4622 'SJ': 'Svalbard and Jan Mayen',
4623 'SZ': 'Swaziland',
4624 'SE': 'Sweden',
4625 'CH': 'Switzerland',
4626 'SY': 'Syrian Arab Republic',
4627 'TW': 'Taiwan, Province of China',
4628 'TJ': 'Tajikistan',
4629 'TZ': 'Tanzania, United Republic of',
4630 'TH': 'Thailand',
4631 'TL': 'Timor-Leste',
4632 'TG': 'Togo',
4633 'TK': 'Tokelau',
4634 'TO': 'Tonga',
4635 'TT': 'Trinidad and Tobago',
4636 'TN': 'Tunisia',
4637 'TR': 'Turkey',
4638 'TM': 'Turkmenistan',
4639 'TC': 'Turks and Caicos Islands',
4640 'TV': 'Tuvalu',
4641 'UG': 'Uganda',
4642 'UA': 'Ukraine',
4643 'AE': 'United Arab Emirates',
4644 'GB': 'United Kingdom',
4645 'US': 'United States',
4646 'UM': 'United States Minor Outlying Islands',
4647 'UY': 'Uruguay',
4648 'UZ': 'Uzbekistan',
4649 'VU': 'Vanuatu',
4650 'VE': 'Venezuela, Bolivarian Republic of',
4651 'VN': 'Viet Nam',
4652 'VG': 'Virgin Islands, British',
4653 'VI': 'Virgin Islands, U.S.',
4654 'WF': 'Wallis and Futuna',
4655 'EH': 'Western Sahara',
4656 'YE': 'Yemen',
4657 'ZM': 'Zambia',
4658 'ZW': 'Zimbabwe',
4659 # Not ISO 3166 codes, but used for IP blocks
4660 'AP': 'Asia/Pacific Region',
4661 'EU': 'Europe',
4662 }
4663
4664 @classmethod
4665 def short2full(cls, code):
4666 """Convert an ISO 3166-2 country code to the corresponding full name"""
4667 return cls._country_map.get(code.upper())
4668
4669
4670 class GeoUtils:
4671 # Major IPv4 address blocks per country
4672 _country_ip_map = {
4673 'AD': '46.172.224.0/19',
4674 'AE': '94.200.0.0/13',
4675 'AF': '149.54.0.0/17',
4676 'AG': '209.59.64.0/18',
4677 'AI': '204.14.248.0/21',
4678 'AL': '46.99.0.0/16',
4679 'AM': '46.70.0.0/15',
4680 'AO': '105.168.0.0/13',
4681 'AP': '182.50.184.0/21',
4682 'AQ': '23.154.160.0/24',
4683 'AR': '181.0.0.0/12',
4684 'AS': '202.70.112.0/20',
4685 'AT': '77.116.0.0/14',
4686 'AU': '1.128.0.0/11',
4687 'AW': '181.41.0.0/18',
4688 'AX': '185.217.4.0/22',
4689 'AZ': '5.197.0.0/16',
4690 'BA': '31.176.128.0/17',
4691 'BB': '65.48.128.0/17',
4692 'BD': '114.130.0.0/16',
4693 'BE': '57.0.0.0/8',
4694 'BF': '102.178.0.0/15',
4695 'BG': '95.42.0.0/15',
4696 'BH': '37.131.0.0/17',
4697 'BI': '154.117.192.0/18',
4698 'BJ': '137.255.0.0/16',
4699 'BL': '185.212.72.0/23',
4700 'BM': '196.12.64.0/18',
4701 'BN': '156.31.0.0/16',
4702 'BO': '161.56.0.0/16',
4703 'BQ': '161.0.80.0/20',
4704 'BR': '191.128.0.0/12',
4705 'BS': '24.51.64.0/18',
4706 'BT': '119.2.96.0/19',
4707 'BW': '168.167.0.0/16',
4708 'BY': '178.120.0.0/13',
4709 'BZ': '179.42.192.0/18',
4710 'CA': '99.224.0.0/11',
4711 'CD': '41.243.0.0/16',
4712 'CF': '197.242.176.0/21',
4713 'CG': '160.113.0.0/16',
4714 'CH': '85.0.0.0/13',
4715 'CI': '102.136.0.0/14',
4716 'CK': '202.65.32.0/19',
4717 'CL': '152.172.0.0/14',
4718 'CM': '102.244.0.0/14',
4719 'CN': '36.128.0.0/10',
4720 'CO': '181.240.0.0/12',
4721 'CR': '201.192.0.0/12',
4722 'CU': '152.206.0.0/15',
4723 'CV': '165.90.96.0/19',
4724 'CW': '190.88.128.0/17',
4725 'CY': '31.153.0.0/16',
4726 'CZ': '88.100.0.0/14',
4727 'DE': '53.0.0.0/8',
4728 'DJ': '197.241.0.0/17',
4729 'DK': '87.48.0.0/12',
4730 'DM': '192.243.48.0/20',
4731 'DO': '152.166.0.0/15',
4732 'DZ': '41.96.0.0/12',
4733 'EC': '186.68.0.0/15',
4734 'EE': '90.190.0.0/15',
4735 'EG': '156.160.0.0/11',
4736 'ER': '196.200.96.0/20',
4737 'ES': '88.0.0.0/11',
4738 'ET': '196.188.0.0/14',
4739 'EU': '2.16.0.0/13',
4740 'FI': '91.152.0.0/13',
4741 'FJ': '144.120.0.0/16',
4742 'FK': '80.73.208.0/21',
4743 'FM': '119.252.112.0/20',
4744 'FO': '88.85.32.0/19',
4745 'FR': '90.0.0.0/9',
4746 'GA': '41.158.0.0/15',
4747 'GB': '25.0.0.0/8',
4748 'GD': '74.122.88.0/21',
4749 'GE': '31.146.0.0/16',
4750 'GF': '161.22.64.0/18',
4751 'GG': '62.68.160.0/19',
4752 'GH': '154.160.0.0/12',
4753 'GI': '95.164.0.0/16',
4754 'GL': '88.83.0.0/19',
4755 'GM': '160.182.0.0/15',
4756 'GN': '197.149.192.0/18',
4757 'GP': '104.250.0.0/19',
4758 'GQ': '105.235.224.0/20',
4759 'GR': '94.64.0.0/13',
4760 'GT': '168.234.0.0/16',
4761 'GU': '168.123.0.0/16',
4762 'GW': '197.214.80.0/20',
4763 'GY': '181.41.64.0/18',
4764 'HK': '113.252.0.0/14',
4765 'HN': '181.210.0.0/16',
4766 'HR': '93.136.0.0/13',
4767 'HT': '148.102.128.0/17',
4768 'HU': '84.0.0.0/14',
4769 'ID': '39.192.0.0/10',
4770 'IE': '87.32.0.0/12',
4771 'IL': '79.176.0.0/13',
4772 'IM': '5.62.80.0/20',
4773 'IN': '117.192.0.0/10',
4774 'IO': '203.83.48.0/21',
4775 'IQ': '37.236.0.0/14',
4776 'IR': '2.176.0.0/12',
4777 'IS': '82.221.0.0/16',
4778 'IT': '79.0.0.0/10',
4779 'JE': '87.244.64.0/18',
4780 'JM': '72.27.0.0/17',
4781 'JO': '176.29.0.0/16',
4782 'JP': '133.0.0.0/8',
4783 'KE': '105.48.0.0/12',
4784 'KG': '158.181.128.0/17',
4785 'KH': '36.37.128.0/17',
4786 'KI': '103.25.140.0/22',
4787 'KM': '197.255.224.0/20',
4788 'KN': '198.167.192.0/19',
4789 'KP': '175.45.176.0/22',
4790 'KR': '175.192.0.0/10',
4791 'KW': '37.36.0.0/14',
4792 'KY': '64.96.0.0/15',
4793 'KZ': '2.72.0.0/13',
4794 'LA': '115.84.64.0/18',
4795 'LB': '178.135.0.0/16',
4796 'LC': '24.92.144.0/20',
4797 'LI': '82.117.0.0/19',
4798 'LK': '112.134.0.0/15',
4799 'LR': '102.183.0.0/16',
4800 'LS': '129.232.0.0/17',
4801 'LT': '78.56.0.0/13',
4802 'LU': '188.42.0.0/16',
4803 'LV': '46.109.0.0/16',
4804 'LY': '41.252.0.0/14',
4805 'MA': '105.128.0.0/11',
4806 'MC': '88.209.64.0/18',
4807 'MD': '37.246.0.0/16',
4808 'ME': '178.175.0.0/17',
4809 'MF': '74.112.232.0/21',
4810 'MG': '154.126.0.0/17',
4811 'MH': '117.103.88.0/21',
4812 'MK': '77.28.0.0/15',
4813 'ML': '154.118.128.0/18',
4814 'MM': '37.111.0.0/17',
4815 'MN': '49.0.128.0/17',
4816 'MO': '60.246.0.0/16',
4817 'MP': '202.88.64.0/20',
4818 'MQ': '109.203.224.0/19',
4819 'MR': '41.188.64.0/18',
4820 'MS': '208.90.112.0/22',
4821 'MT': '46.11.0.0/16',
4822 'MU': '105.16.0.0/12',
4823 'MV': '27.114.128.0/18',
4824 'MW': '102.70.0.0/15',
4825 'MX': '187.192.0.0/11',
4826 'MY': '175.136.0.0/13',
4827 'MZ': '197.218.0.0/15',
4828 'NA': '41.182.0.0/16',
4829 'NC': '101.101.0.0/18',
4830 'NE': '197.214.0.0/18',
4831 'NF': '203.17.240.0/22',
4832 'NG': '105.112.0.0/12',
4833 'NI': '186.76.0.0/15',
4834 'NL': '145.96.0.0/11',
4835 'NO': '84.208.0.0/13',
4836 'NP': '36.252.0.0/15',
4837 'NR': '203.98.224.0/19',
4838 'NU': '49.156.48.0/22',
4839 'NZ': '49.224.0.0/14',
4840 'OM': '5.36.0.0/15',
4841 'PA': '186.72.0.0/15',
4842 'PE': '186.160.0.0/14',
4843 'PF': '123.50.64.0/18',
4844 'PG': '124.240.192.0/19',
4845 'PH': '49.144.0.0/13',
4846 'PK': '39.32.0.0/11',
4847 'PL': '83.0.0.0/11',
4848 'PM': '70.36.0.0/20',
4849 'PR': '66.50.0.0/16',
4850 'PS': '188.161.0.0/16',
4851 'PT': '85.240.0.0/13',
4852 'PW': '202.124.224.0/20',
4853 'PY': '181.120.0.0/14',
4854 'QA': '37.210.0.0/15',
4855 'RE': '102.35.0.0/16',
4856 'RO': '79.112.0.0/13',
4857 'RS': '93.86.0.0/15',
4858 'RU': '5.136.0.0/13',
4859 'RW': '41.186.0.0/16',
4860 'SA': '188.48.0.0/13',
4861 'SB': '202.1.160.0/19',
4862 'SC': '154.192.0.0/11',
4863 'SD': '102.120.0.0/13',
4864 'SE': '78.64.0.0/12',
4865 'SG': '8.128.0.0/10',
4866 'SI': '188.196.0.0/14',
4867 'SK': '78.98.0.0/15',
4868 'SL': '102.143.0.0/17',
4869 'SM': '89.186.32.0/19',
4870 'SN': '41.82.0.0/15',
4871 'SO': '154.115.192.0/18',
4872 'SR': '186.179.128.0/17',
4873 'SS': '105.235.208.0/21',
4874 'ST': '197.159.160.0/19',
4875 'SV': '168.243.0.0/16',
4876 'SX': '190.102.0.0/20',
4877 'SY': '5.0.0.0/16',
4878 'SZ': '41.84.224.0/19',
4879 'TC': '65.255.48.0/20',
4880 'TD': '154.68.128.0/19',
4881 'TG': '196.168.0.0/14',
4882 'TH': '171.96.0.0/13',
4883 'TJ': '85.9.128.0/18',
4884 'TK': '27.96.24.0/21',
4885 'TL': '180.189.160.0/20',
4886 'TM': '95.85.96.0/19',
4887 'TN': '197.0.0.0/11',
4888 'TO': '175.176.144.0/21',
4889 'TR': '78.160.0.0/11',
4890 'TT': '186.44.0.0/15',
4891 'TV': '202.2.96.0/19',
4892 'TW': '120.96.0.0/11',
4893 'TZ': '156.156.0.0/14',
4894 'UA': '37.52.0.0/14',
4895 'UG': '102.80.0.0/13',
4896 'US': '6.0.0.0/8',
4897 'UY': '167.56.0.0/13',
4898 'UZ': '84.54.64.0/18',
4899 'VA': '212.77.0.0/19',
4900 'VC': '207.191.240.0/21',
4901 'VE': '186.88.0.0/13',
4902 'VG': '66.81.192.0/20',
4903 'VI': '146.226.0.0/16',
4904 'VN': '14.160.0.0/11',
4905 'VU': '202.80.32.0/20',
4906 'WF': '117.20.32.0/21',
4907 'WS': '202.4.32.0/19',
4908 'YE': '134.35.0.0/16',
4909 'YT': '41.242.116.0/22',
4910 'ZA': '41.0.0.0/11',
4911 'ZM': '102.144.0.0/13',
4912 'ZW': '102.177.192.0/18',
4913 }
4914
4915 @classmethod
4916 def random_ipv4(cls, code_or_block):
4917 if len(code_or_block) == 2:
4918 block = cls._country_ip_map.get(code_or_block.upper())
4919 if not block:
4920 return None
4921 else:
4922 block = code_or_block
4923 addr, preflen = block.split('/')
4924 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4925 addr_max = addr_min | (0xffffffff >> int(preflen))
4926 return str(socket.inet_ntoa(
4927 struct.pack('!L', random.randint(addr_min, addr_max))))
4928
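# Editor's sketch: pass a two-letter country code or a CIDR block; the result
# is a random address inside that block (the value varies per call):
#     >>> GeoUtils.random_ipv4('DE')
#     '53.17.203.104'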
4929
4930 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4931 def __init__(self, proxies=None):
4932 # Set default handlers
4933 for type in ('http', 'https'):
4934 setattr(self, '%s_open' % type,
4935 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4936 meth(r, proxy, type))
4937 urllib.request.ProxyHandler.__init__(self, proxies)
4938
4939 def proxy_open(self, req, proxy, type):
4940 req_proxy = req.headers.get('Ytdl-request-proxy')
4941 if req_proxy is not None:
4942 proxy = req_proxy
4943 del req.headers['Ytdl-request-proxy']
4944
4945 if proxy == '__noproxy__':
4946 return None # No Proxy
4947 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4948 req.add_header('Ytdl-socks-proxy', proxy)
4949 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
4950 return None
4951 return urllib.request.ProxyHandler.proxy_open(
4952 self, req, proxy, type)
4953
4954
4955 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4956 # released into Public Domain
4957 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4958
4959 def long_to_bytes(n, blocksize=0):
4960 """long_to_bytes(n:long, blocksize:int) : string
4961 Convert a long integer to a byte string.
4962
4963 If optional blocksize is given and greater than zero, pad the front of the
4964 byte string with binary zeros so that the length is a multiple of
4965 blocksize.
4966 """
4967 # after much testing, this algorithm was deemed to be the fastest
4968 s = b''
4969 n = int(n)
4970 while n > 0:
4971 s = struct.pack('>I', n & 0xffffffff) + s
4972 n = n >> 32
4973 # strip off leading zeros
4974 for i in range(len(s)):
4975 if s[i] != b'\000'[0]:
4976 break
4977 else:
4978 # only happens when n == 0
4979 s = b'\000'
4980 i = 0
4981 s = s[i:]
4982 # add back some pad bytes. this could be done more efficiently w.r.t. the
4983 # de-padding being done above, but sigh...
4984 if blocksize > 0 and len(s) % blocksize:
4985 s = (blocksize - len(s) % blocksize) * b'\000' + s
4986 return s
4987
4988
4989 def bytes_to_long(s):
4990 """bytes_to_long(string) : long
4991 Convert a byte string to a long integer.
4992
4993 This is (essentially) the inverse of long_to_bytes().
4994 """
4995 acc = 0
4996 length = len(s)
4997 if length % 4:
4998 extra = (4 - length % 4)
4999 s = b'\000' * extra + s
5000 length = length + extra
5001 for i in range(0, length, 4):
5002 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
5003 return acc
5004
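# Editor's round-trip example:
#     >>> bytes_to_long(b'\x01\x00')
#     256
#     >>> long_to_bytes(256, blocksize=4)
#     b'\x00\x00\x01\x00'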
5005
5006 def ohdave_rsa_encrypt(data, exponent, modulus):
5007 '''
5008 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
5009
5010 Input:
5011 data: data to encrypt, bytes-like object
5012 exponent, modulus: parameter e and N of RSA algorithm, both integer
5013 Output: hex string of encrypted data
5014
5015 Limitation: supports one block encryption only
5016 '''
5017
5018 payload = int(binascii.hexlify(data[::-1]), 16)
5019 encrypted = pow(payload, exponent, modulus)
5020 return '%x' % encrypted
5021
5022
5023 def pkcs1pad(data, length):
5024 """
5025 Padding input data with PKCS#1 scheme
5026
5027 @param {int[]} data input data
5028 @param {int} length target length
5029 @returns {int[]} padded data
5030 """
5031 if len(data) > length - 11:
5032 raise ValueError('Input data too long for PKCS#1 padding')
5033
5034 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding bytes must be non-zero per PKCS#1 v1.5
5035 return [0, 2] + pseudo_random + [0] + data
5036
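# Editor's note: the layout produced above is 0x00 0x02 <non-zero random PS>
# 0x00 <data>, i.e. a PKCS#1 v1.5 block type 2:
#     >>> pkcs1pad([1, 2, 3], 16)
#     [0, 2, r, r, r, r, r, r, r, r, r, r, 0, 1, 2, 3]   # r = random 1..255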
5037
5038 def _base_n_table(n, table):
5039 if not table and not n:
5040 raise ValueError('Either table or n must be specified')
5041 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
5042
5043 if n and n != len(table):
5044 raise ValueError(f'base {n} exceeds table length {len(table)}')
5045 return table
5046
5047
5048 def encode_base_n(num, n=None, table=None):
5049 """Convert given int to a base-n string"""
5050 table = _base_n_table(n, table)
5051 if not num:
5052 return table[0]
5053
5054 result, base = '', len(table)
5055 while num:
5056 result = table[num % base] + result
5057 num = num // base
5058 return result
5059
5060
5061 def decode_base_n(string, n=None, table=None):
5062 """Convert given base-n string to int"""
5063 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
5064 result, base = 0, len(table)
5065 for char in string:
5066 result = result * base + table[char]
5067 return result
5068
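# Editor's round-trip example using the default table:
#     >>> encode_base_n(255, 16)
#     'ff'
#     >>> decode_base_n('ff', 16)
#     255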
5069
5070 def decode_base(value, digits):
5071 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
5072 f'in a future version. Use {__name__}.decode_base_n instead')
5073 return decode_base_n(value, table=digits)
5074
5075
5076 def decode_packed_codes(code):
5077 mobj = re.search(PACKED_CODES_RE, code)
5078 obfuscated_code, base, count, symbols = mobj.groups()
5079 base = int(base)
5080 count = int(count)
5081 symbols = symbols.split('|')
5082 symbol_table = {}
5083
5084 while count:
5085 count -= 1
5086 base_n_count = encode_base_n(count, base)
5087 symbol_table[base_n_count] = symbols[count] or base_n_count
5088
5089 return re.sub(
5090 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
5091 obfuscated_code)
5092
5093
5094 def caesar(s, alphabet, shift):
5095 if shift == 0:
5096 return s
5097 l = len(alphabet)
5098 return ''.join(
5099 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5100 for c in s)
5101
5102
5103 def rot47(s):
5104 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
5105
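# Editor's examples: `caesar` shifts only characters of the given alphabet,
# and rot47 is its own inverse, since 47 + 47 = 94 (the alphabet length):
#     >>> caesar('ab-c', 'abcdefghijklmnopqrstuvwxyz', 1)
#     'bc-d'
#     >>> rot47(rot47('secret')) == 'secret'
#     True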
5106
5107 def parse_m3u8_attributes(attrib):
5108 info = {}
5109 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5110 if val.startswith('"'):
5111 val = val[1:-1]
5112 info[key] = val
5113 return info
5114
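# Editor's example with a typical EXT-X-STREAM-INF attribute list:
#     >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#     {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}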
5115
5116 def urshift(val, n):
5117 return val >> n if val >= 0 else (val + 0x100000000) >> n
5118
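# Editor's note: this emulates JavaScript's unsigned right shift (`>>>`) on
# 32-bit values, e.g.:
#     >>> urshift(-1, 28)
#     15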
5119
5120 # Based on png2str() written by @gdkchan and improved by @yokrysty
5121 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5122 def decode_png(png_data):
5123 # Reference: https://www.w3.org/TR/PNG/
5124 header = png_data[8:]
5125
5126 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
5127 raise OSError('Not a valid PNG file.')
5128
5129 int_map = {1: '>B', 2: '>H', 4: '>I'}
5130 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
5131
5132 chunks = []
5133
5134 while header:
5135 length = unpack_integer(header[:4])
5136 header = header[4:]
5137
5138 chunk_type = header[:4]
5139 header = header[4:]
5140
5141 chunk_data = header[:length]
5142 header = header[length:]
5143
5144 header = header[4:] # Skip CRC
5145
5146 chunks.append({
5147 'type': chunk_type,
5148 'length': length,
5149 'data': chunk_data
5150 })
5151
5152 ihdr = chunks[0]['data']
5153
5154 width = unpack_integer(ihdr[:4])
5155 height = unpack_integer(ihdr[4:8])
5156
5157 idat = b''
5158
5159 for chunk in chunks:
5160 if chunk['type'] == b'IDAT':
5161 idat += chunk['data']
5162
5163 if not idat:
5164 raise OSError('Unable to read PNG data.')
5165
5166 decompressed_data = bytearray(zlib.decompress(idat))
5167
5168 stride = width * 3
5169 pixels = []
5170
5171 def _get_pixel(idx):
5172 x = idx % stride
5173 y = idx // stride
5174 return pixels[y][x]
5175
5176 for y in range(height):
5177 basePos = y * (1 + stride)
5178 filter_type = decompressed_data[basePos]
5179
5180 current_row = []
5181
5182 pixels.append(current_row)
5183
5184 for x in range(stride):
5185 color = decompressed_data[1 + basePos + x]
5186 basex = y * stride + x
5187 left = 0
5188 up = 0
5189
5190 if x > 2:
5191 left = _get_pixel(basex - 3)
5192 if y > 0:
5193 up = _get_pixel(basex - stride)
5194
5195 if filter_type == 1: # Sub
5196 color = (color + left) & 0xff
5197 elif filter_type == 2: # Up
5198 color = (color + up) & 0xff
5199 elif filter_type == 3: # Average
5200 color = (color + ((left + up) >> 1)) & 0xff
5201 elif filter_type == 4: # Paeth
5202 a = left
5203 b = up
5204 c = 0
5205
5206 if x > 2 and y > 0:
5207 c = _get_pixel(basex - stride - 3)
5208
5209 p = a + b - c
5210
5211 pa = abs(p - a)
5212 pb = abs(p - b)
5213 pc = abs(p - c)
5214
5215 if pa <= pb and pa <= pc:
5216 color = (color + a) & 0xff
5217 elif pb <= pc:
5218 color = (color + b) & 0xff
5219 else:
5220 color = (color + c) & 0xff
5221
5222 current_row.append(color)
5223
5224 return width, height, pixels
5225
5226
5227 def write_xattr(path, key, value):
5228 # Windows: Write xattrs to NTFS Alternate Data Streams:
5229 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5230 if compat_os_name == 'nt':
5231 assert ':' not in key
5232 assert os.path.exists(path)
5233
5234 try:
5235 with open(f'{path}:{key}', 'wb') as f:
5236 f.write(value)
5237 except OSError as e:
5238 raise XAttrMetadataError(e.errno, e.strerror)
5239 return
5240
5241 # UNIX Method 1. Use xattrs/pyxattrs modules
5242
5243 setxattr = None
5244 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5245 # Unicode arguments are not supported in pyxattr until version 0.5.0
5246 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5247 if version_tuple(xattr.__version__) >= (0, 5, 0):
5248 setxattr = xattr.set
5249 elif xattr:
5250 setxattr = xattr.setxattr
5251
5252 if setxattr:
5253 try:
5254 setxattr(path, key, value)
5255 except OSError as e:
5256 raise XAttrMetadataError(e.errno, e.strerror)
5257 return
5258
5259 # UNIX Method 2. Use setfattr/xattr executables
5260 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5261 else 'xattr' if check_executable('xattr', ['-h']) else None)
5262 if not exe:
5263 raise XAttrUnavailableError(
5264 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5265 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5266
5267 value = value.decode()
5268 try:
5269 _, stderr, returncode = Popen.run(
5270 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5271 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5272 except OSError as e:
5273 raise XAttrMetadataError(e.errno, e.strerror)
5274 if returncode:
5275 raise XAttrMetadataError(returncode, stderr)
5276
5277
5278 def random_birthday(year_field, month_field, day_field):
5279 start_date = datetime.date(1950, 1, 1)
5280 end_date = datetime.date(1995, 12, 31)
5281 offset = random.randint(0, (end_date - start_date).days)
5282 random_date = start_date + datetime.timedelta(offset)
5283 return {
5284 year_field: str(random_date.year),
5285 month_field: str(random_date.month),
5286 day_field: str(random_date.day),
5287 }
5288
5289
5290 def find_available_port(interface=''):
5291 try:
5292 with socket.socket() as sock:
5293 sock.bind((interface, 0))
5294 return sock.getsockname()[1]
5295 except OSError:
5296 return None
5297
5298
5299 # Templates for internet shortcut files, which are plain text files.
5300 DOT_URL_LINK_TEMPLATE = '''\
5301 [InternetShortcut]
5302 URL=%(url)s
5303 '''
5304
5305 DOT_WEBLOC_LINK_TEMPLATE = '''\
5306 <?xml version="1.0" encoding="UTF-8"?>
5307 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5308 <plist version="1.0">
5309 <dict>
5310 \t<key>URL</key>
5311 \t<string>%(url)s</string>
5312 </dict>
5313 </plist>
5314 '''
5315
5316 DOT_DESKTOP_LINK_TEMPLATE = '''\
5317 [Desktop Entry]
5318 Encoding=UTF-8
5319 Name=%(filename)s
5320 Type=Link
5321 URL=%(url)s
5322 Icon=text-html
5323 '''
5324
5325 LINK_TEMPLATES = {
5326 'url': DOT_URL_LINK_TEMPLATE,
5327 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5328 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5329 }
5330
5331
5332 def iri_to_uri(iri):
5333 """
5334 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5335
5336 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-encodes each not-yet-escaped character using its UTF-8 bytes, leaving already-escaped sequences intact.
5337 """
5338
5339 iri_parts = urllib.parse.urlparse(iri)
5340
5341 if '[' in iri_parts.netloc:
5342 raise ValueError('IPv6 URIs are not yet supported.')
5343 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5344
5345 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5346
5347 net_location = ''
5348 if iri_parts.username:
5349 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5350 if iri_parts.password is not None:
5351 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5352 net_location += '@'
5353
5354 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5355 # The 'idna' encoding produces ASCII text.
5356 if iri_parts.port is not None and iri_parts.port != 80:
5357 net_location += ':' + str(iri_parts.port)
5358
5359 return urllib.parse.urlunparse(
5360 (iri_parts.scheme,
5361 net_location,
5362
5363 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5364
5365 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5366 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5367
5368 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5369 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5370
5371 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5372
5373 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5374
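# Editor's example of the escaping behaviour described in the docstring:
#     >>> iri_to_uri('https://www.google.com/search?q=Käsesoßenrührlöffel')
#     'https://www.google.com/search?q=K%C3%A4seso%C3%9Fenr%C3%BChrl%C3%B6ffel'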
5375
5376 def to_high_limit_path(path):
5377 if sys.platform in ['win32', 'cygwin']:
5378 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5379 return '\\\\?\\' + os.path.abspath(path)
5380
5381 return path
5382
5383
5384 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5385 val = traverse_obj(obj, *variadic(field))
5386 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5387 return default
5388 return template % func(val)
5389
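# Editor's usage sketch (field names are made up):
#     >>> format_field({'width': 1920}, 'width', '%dpx', default='unknown')
#     '1920px'
#     >>> format_field({}, 'width', '%dpx', default='unknown')
#     'unknown'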
5390
5391 def clean_podcast_url(url):
5392 return re.sub(r'''(?x)
5393 (?:
5394 (?:
5395 chtbl\.com/track|
5396 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5397 play\.podtrac\.com
5398 )/[^/]+|
5399 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5400 flex\.acast\.com|
5401 pd(?:
5402 cn\.co| # https://podcorn.com/analytics-prefix/
5403 st\.fm # https://podsights.com/docs/
5404 )/e
5405 )/''', '', url)
5406
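# Editor's example with one of the tracking prefixes above and a hypothetical
# feed URL:
#     >>> clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/feeds.example.com/ep1.mp3')
#     'https://feeds.example.com/ep1.mp3'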
5407
5408 _HEX_TABLE = '0123456789abcdef'
5409
5410
5411 def random_uuidv4():
5412 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5413
5414
5415 def make_dir(path, to_screen=None):
5416 try:
5417 dn = os.path.dirname(path)
5418 if dn:
5419 os.makedirs(dn, exist_ok=True)
5420 return True
5421 except OSError as err:
5422 if callable(to_screen):  # callable() returns a bool, so comparing it to None was always true
5423 to_screen('unable to create directory ' + error_to_compat_str(err))
5424 return False
5425
5426
5427 def get_executable_path():
5428 from .update import _get_variant_and_executable_path
5429
5430 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5431
5432
5433 def get_user_config_dirs(package_name):
5434 # .config (e.g. ~/.config/package_name)
5435 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
5436 yield os.path.join(xdg_config_home, package_name)
5437
5438 # appdata (%APPDATA%/package_name)
5439 appdata_dir = os.getenv('appdata')
5440 if appdata_dir:
5441 yield os.path.join(appdata_dir, package_name)
5442
5443 # home (~/.package_name)
5444 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
5445
5446
5447 def get_system_config_dirs(package_name):
5448 # /etc/package_name
5449 yield os.path.join('/etc', package_name)
5450
5451
5452 def traverse_obj(
5453 obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
5454 casesense=True, is_user_input=False, traverse_string=False):
5455 """
5456 Safely traverse nested `dict`s and `Sequence`s
5457
5458 >>> obj = [{}, {"key": "value"}]
5459 >>> traverse_obj(obj, (1, "key"))
5460 "value"
5461
5462 Each of the provided `paths` is tested and the first producing a valid result will be returned.
5463 The next path will also be tested if the path branched but no results could be found.
5464 Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
5465 Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded.
5466
5467 The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5468
5469 The keys in the path can be one of:
5470 - `None`: Return the current object.
5471 - `set`: Requires the only item in the set to be a type or function,
5472 like `{type}`/`{func}`. If a `type`, returns only values
5473 of this type. If a function, returns `func(obj)`.
5474 - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
5475 - `slice`: Branch out and return all values in `obj[key]`.
5476 - `Ellipsis`: Branch out and return a list of all values.
5477 - `tuple`/`list`: Branch out and return a list of all matching values.
5478 Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5479 - `function`: Branch out and return values filtered by the function.
5480 Read as: `[value for key, value in obj if function(key, value)]`.
5481 For `Sequence`s, `key` is the index of the value.
5482 For `re.Match`es, `key` is the group number (0 = full match)
5483 as well as additionally any group names, if given.
5484 - `dict`: Transform the current object and return a matching dict.
5485 Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5486
5487 `tuple`, `list`, and `dict` all support nested paths and branches.
5488
5489 @params paths Paths which to traverse by.
5490 @param default Value to return if the paths do not match.
5491 If the last key in the path is a `dict`, it will apply to each value inside
5492 the dict instead, depth first. Try to avoid if using nested `dict` keys.
5493 @param expected_type If a `type`, only accept final values of this type.
5494 If any other callable, try to call the function on each result.
5495 If the last key in the path is a `dict`, it will apply to each value inside
5496 the dict instead, recursively. This does respect branching paths.
5497 @param get_all If `False`, return the first matching result, otherwise all matching ones.
5498 @param casesense If `False`, consider string dictionary keys as case insensitive.
5499
5500 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5501
5502 @param is_user_input Whether the keys are generated from user input.
5503 If `True` strings get converted to `int`/`slice` if needed.
5504 @param traverse_string Whether to traverse into objects as strings.
5505 If `True`, any non-compatible object will first be
5506 converted into a string and then traversed into.
5507 The return value of that path will be a string instead,
5508 not respecting any further branching.
5509
5510
5511 @returns The result of the object traversal.
5512 If successful, `get_all=True`, and the path branches at least once,
5513 then a list of results is returned instead.
5514 If no `default` is given and the last path branches, a `list` of results
5515 is always returned. If a path ends on a `dict` that result will always be a `dict`.
5516 """
5517 is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
5518 casefold = lambda k: k.casefold() if isinstance(k, str) else k
5519
5520 if isinstance(expected_type, type):
5521 type_test = lambda val: val if isinstance(val, expected_type) else None
5522 else:
5523 type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5524
5525 def apply_key(key, obj, is_last):
5526 branching = False
5527 result = None
5528
5529 if obj is None and traverse_string:
5530 pass
5531
5532 elif key is None:
5533 result = obj
5534
5535 elif isinstance(key, set):
5536 assert len(key) == 1, 'Set should only be used to wrap a single item'
5537 item = next(iter(key))
5538 if isinstance(item, type):
5539 if isinstance(obj, item):
5540 result = obj
5541 else:
5542 result = try_call(item, args=(obj,))
5543
5544 elif isinstance(key, (list, tuple)):
5545 branching = True
5546 result = itertools.chain.from_iterable(
5547 apply_path(obj, branch, is_last)[0] for branch in key)
5548
5549 elif key is ...:
5550 branching = True
5551 if isinstance(obj, collections.abc.Mapping):
5552 result = obj.values()
5553 elif is_sequence(obj):
5554 result = obj
5555 elif isinstance(obj, re.Match):
5556 result = obj.groups()
5557 elif traverse_string:
5558 branching = False
5559 result = str(obj)
5560 else:
5561 result = ()
5562
5563 elif callable(key):
5564 branching = True
5565 if isinstance(obj, collections.abc.Mapping):
5566 iter_obj = obj.items()
5567 elif is_sequence(obj):
5568 iter_obj = enumerate(obj)
5569 elif isinstance(obj, re.Match):
5570 iter_obj = itertools.chain(
5571 enumerate((obj.group(), *obj.groups())),
5572 obj.groupdict().items())
5573 elif traverse_string:
5574 branching = False
5575 iter_obj = enumerate(str(obj))
5576 else:
5577 iter_obj = ()
5578
5579 result = (v for k, v in iter_obj if try_call(key, args=(k, v)))
5580 if not branching: # string traversal
5581 result = ''.join(result)
5582
5583 elif isinstance(key, dict):
5584 iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items())
5585 result = {
5586 k: v if v is not None else default for k, v in iter_obj
5587 if v is not None or default is not NO_DEFAULT
5588 } or None
5589
5590 elif isinstance(obj, collections.abc.Mapping):
5591 result = (obj.get(key) if casesense or (key in obj) else
5592 next((v for k, v in obj.items() if casefold(k) == key), None))
5593
5594 elif isinstance(obj, re.Match):
5595 if isinstance(key, int) or casesense:
5596 with contextlib.suppress(IndexError):
5597 result = obj.group(key)
5598
5599 elif isinstance(key, str):
5600 result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
5601
5602 elif isinstance(key, (int, slice)):
5603 if is_sequence(obj):
5604 branching = isinstance(key, slice)
5605 with contextlib.suppress(IndexError):
5606 result = obj[key]
5607 elif traverse_string:
5608 with contextlib.suppress(IndexError):
5609 result = str(obj)[key]
5610
5611 return branching, result if branching else (result,)
5612
5613 def lazy_last(iterable):
5614 iterator = iter(iterable)
5615 prev = next(iterator, NO_DEFAULT)
5616 if prev is NO_DEFAULT:
5617 return
5618
5619 for item in iterator:
5620 yield False, prev
5621 prev = item
5622
5623 yield True, prev
5624
5625 def apply_path(start_obj, path, test_type):
5626 objs = (start_obj,)
5627 has_branched = False
5628
5629 key = None
5630 for last, key in lazy_last(variadic(path, (str, bytes, dict, set))):
5631 if is_user_input and isinstance(key, str):
5632 if key == ':':
5633 key = ...
5634 elif ':' in key:
5635 key = slice(*map(int_or_none, key.split(':')))
5636 elif int_or_none(key) is not None:
5637 key = int(key)
5638
5639 if not casesense and isinstance(key, str):
5640 key = key.casefold()
5641
5642 if __debug__ and callable(key):
5643 # Verify function signature
5644 inspect.signature(key).bind(None, None)
5645
5646 new_objs = []
5647 for obj in objs:
5648 branching, results = apply_key(key, obj, last)
5649 has_branched |= branching
5650 new_objs.append(results)
5651
5652 objs = itertools.chain.from_iterable(new_objs)
5653
5654 if test_type and not isinstance(key, (dict, list, tuple)):
5655 objs = map(type_test, objs)
5656
5657 return objs, has_branched, isinstance(key, dict)
5658
5659 def _traverse_obj(obj, path, allow_empty, test_type):
5660 results, has_branched, is_dict = apply_path(obj, path, test_type)
5661 results = LazyList(item for item in results if item not in (None, {}))
5662 if get_all and has_branched:
5663 if results:
5664 return results.exhaust()
5665 if allow_empty:
5666 return [] if default is NO_DEFAULT else default
5667 return None
5668
5669 return results[0] if results else {} if allow_empty and is_dict else None
5670
5671 for index, path in enumerate(paths, 1):
5672 result = _traverse_obj(obj, path, index == len(paths), True)
5673 if result is not None:
5674 return result
5675
5676 return None if default is NO_DEFAULT else default
5677
5678
5679 def traverse_dict(dictn, keys, casesense=True):
5680 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5681 f'in a future version. Use "{__name__}.traverse_obj" instead')
5682 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5683
5684
5685 def get_first(obj, keys, **kwargs):
5686 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
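# e.g. get_first([{'id': None}, {'id': 'abc'}], 'id') == 'abc' -- the first non-empty
# value across the branched path, since `get_all=False`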
5687
5688
5689 def time_seconds(**kwargs):
5690 """
5691 Returns the current time, in seconds since the epoch (1970-01-01T00:00:00Z), shifted by the given timezone offset
5692 """
5693 return time.time() + datetime.timedelta(**kwargs).total_seconds()
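# Usage sketch: a clock in a specific timezone, e.g.
# time_seconds(hours=9) -- "now" as read off a UTC+09:00 (JST) wall clock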
5694
5695
5696 # Create a JSON Web Signature (JWS) with the HS256 algorithm.
5697 # The resulting format is JWS Compact Serialization.
5698 # Implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5699 # Implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5700 def jwt_encode_hs256(payload_data, key, headers={}):
5701 header_data = {
5702 'alg': 'HS256',
5703 'typ': 'JWT',
5704 }
5705 if headers:
5706 header_data.update(headers)
5707 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5708 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5709 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5710 signature_b64 = base64.b64encode(h.digest())
5711 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5712 return token
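# Usage sketch (hypothetical key and claims):
# token = jwt_encode_hs256({'sub': 'user', 'exp': 1700000000}, 'secret-key')
# NB: standard (padded) base64 is used above, whereas RFC 7515 prescribes unpadded
# base64url; this only matters to strict verifiers.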
5713
5714
5715 # Can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5716 def jwt_decode_hs256(jwt):
5717 header_b64, payload_b64, signature_b64 = jwt.split('.')
5718 # add trailing ='s that may have been stripped, superfluous ='s are ignored
5719 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
5720 return payload_data
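# e.g. jwt_decode_hs256(token.decode()) recovers the payload dict from the token
# produced by jwt_encode_hs256 above (the signature is *not* verified)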
5721
5722
5723 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5724
5725
5726 @functools.cache
5727 def supports_terminal_sequences(stream):
5728 if compat_os_name == 'nt':
5729 if not WINDOWS_VT_MODE:
5730 return False
5731 elif not os.getenv('TERM'):
5732 return False
5733 try:
5734 return stream.isatty()
5735 except BaseException:
5736 return False
5737
5738
5739 def windows_enable_vt_mode():
5740 """Ref: https://bugs.python.org/issue30075 """
5741 if get_windows_version() < (10, 0, 10586):
5742 return
5743
5744 import ctypes
5745 import ctypes.wintypes
5746 import msvcrt
5747
5748 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5749
5750 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5751 handle = os.open('CONOUT$', os.O_RDWR)
5752 try:
5753 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5754 dw_original_mode = ctypes.wintypes.DWORD()
5755 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5756 if not success:
5757 raise Exception('GetConsoleMode failed')
5758
5759 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5760 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5761 if not success:
5762 raise Exception('SetConsoleMode failed')
5763 finally:
5764 os.close(handle)
5765
5766 global WINDOWS_VT_MODE
5767 WINDOWS_VT_MODE = True
5768 supports_terminal_sequences.cache_clear()
5769
5770
5771 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5772
5773
5774 def remove_terminal_sequences(string):
5775 return _terminal_sequences_re.sub('', string)
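# e.g. strips ANSI color codes: remove_terminal_sequences('\033[0;31mred\033[0m') == 'red'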
5776
5777
5778 def number_of_digits(number):
5779 return len('%d' % number)
5780
5781
5782 def join_nonempty(*values, delim='-', from_dict=None):
5783 if from_dict is not None:
5784 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5785 return delim.join(map(str, filter(None, values)))
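# e.g. join_nonempty('mp4', None, '', 1080, delim='-') == 'mp4-1080' -- falsy values are dropped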
5786
5787
5788 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5789 """
5790 Find the largest format dimensions in terms of video width and, for each thumbnail:
5791 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5792 * Update dimensions
5793
5794 This function is useful with video services that scale the provided thumbnails on demand
5795 """
5796 _keys = ('width', 'height')
5797 max_dimensions = max(
5798 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5799 default=(0, 0))
5800 if not max_dimensions[0]:
5801 return thumbnails
5802 return [
5803 merge_dicts(
5804 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5805 dict(zip(_keys, max_dimensions)), thumbnail)
5806 for thumbnail in thumbnails
5807 ]
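# Sketch with hypothetical inputs: given formats up to 1920px wide and
# url_width_re=r'(?<=/)\d+(?=px/)', a thumbnail URL like
# 'https://cdn.example/640px/thumb.jpg' is rewritten to 'https://cdn.example/1920px/thumb.jpg'
# and its width/height fields are set to the maximum format dimensions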
5808
5809
5810 def parse_http_range(range):
5811 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5812 if not range:
5813 return None, None, None
5814 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5815 if not crg:
5816 return None, None, None
5817 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
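# e.g. parse_http_range('bytes 0-499/1234') == (0, 499, 1234)
# and open-ended ranges leave fields as None: parse_http_range('bytes=500-') == (500, None, None)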
5818
5819
5820 def read_stdin(what):
5821 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5822 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5823 return sys.stdin
5824
5825
5826 def determine_file_encoding(data):
5827 """
5828 Detect the text encoding used
5829 @returns (encoding, bytes to skip)
5830 """
5831
5832 # BOM marks are given priority over declarations
5833 for bom, enc in BOMS:
5834 if data.startswith(bom):
5835 return enc, len(bom)
5836
5837 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5838 # We ignore the endianness to get a good enough match
5839 data = data.replace(b'\0', b'')
5840 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5841 return mobj.group(1).decode() if mobj else None, 0
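# e.g. determine_file_encoding(b'# coding: utf-8\n...') == ('utf-8', 0), while data starting
# with a BOM returns that encoding plus the BOM length to skip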
5842
5843
5844 class Config:
5845 own_args = None
5846 parsed_args = None
5847 filename = None
5848 __initialized = False
5849
5850 def __init__(self, parser, label=None):
5851 self.parser, self.label = parser, label
5852 self._loaded_paths, self.configs = set(), []
5853
5854 def init(self, args=None, filename=None):
5855 assert not self.__initialized
5856 self.own_args, self.filename = args, filename
5857 return self.load_configs()
5858
5859 def load_configs(self):
5860 directory = ''
5861 if self.filename:
5862 location = os.path.realpath(self.filename)
5863 directory = os.path.dirname(location)
5864 if location in self._loaded_paths:
5865 return False
5866 self._loaded_paths.add(location)
5867
5868 self.__initialized = True
5869 opts, _ = self.parser.parse_known_args(self.own_args)
5870 self.parsed_args = self.own_args
5871 for location in opts.config_locations or []:
5872 if location == '-':
5873 if location in self._loaded_paths:
5874 continue
5875 self._loaded_paths.add(location)
5876 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5877 continue
5878 location = os.path.join(directory, expand_path(location))
5879 if os.path.isdir(location):
5880 location = os.path.join(location, 'yt-dlp.conf')
5881 if not os.path.exists(location):
5882 self.parser.error(f'config location {location} does not exist')
5883 self.append_config(self.read_file(location), location)
5884 return True
5885
5886 def __str__(self):
5887 label = join_nonempty(
5888 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5889 delim=' ')
5890 return join_nonempty(
5891 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5892 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5893 delim='\n')
5894
5895 @staticmethod
5896 def read_file(filename, default=[]):
5897 try:
5898 optionf = open(filename, 'rb')
5899 except OSError:
5900 return default # silently skip if file is not present
5901 try:
5902 enc, skip = determine_file_encoding(optionf.read(512))
5903 optionf.seek(skip, io.SEEK_SET)
5904 except OSError:
5905 enc = None # silently skip read errors
5906 try:
5907 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5908 contents = optionf.read().decode(enc or preferredencoding())
5909 res = shlex.split(contents, comments=True)
5910 except Exception as err:
5911 raise ValueError(f'Unable to parse "{filename}": {err}')
5912 finally:
5913 optionf.close()
5914 return res
5915
5916 @staticmethod
5917 def hide_login_info(opts):
5918 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5919 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5920
5921 def _scrub_eq(o):
5922 m = eqre.match(o)
5923 if m:
5924 return m.group('key') + '=PRIVATE'
5925 else:
5926 return o
5927
5928 opts = list(map(_scrub_eq, opts))
5929 for idx, opt in enumerate(opts):
5930 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5931 opts[idx + 1] = 'PRIVATE'
5932 return opts
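# Behaviour sketch:
# hide_login_info(['-u', 'alice', '--password=hunter2'])
# == ['-u', 'PRIVATE', '--password=PRIVATE']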
5933
5934 def append_config(self, *args, label=None):
5935 config = type(self)(self.parser, label)
5936 config._loaded_paths = self._loaded_paths
5937 if config.init(*args):
5938 self.configs.append(config)
5939
5940 @property
5941 def all_args(self):
5942 for config in reversed(self.configs):
5943 yield from config.all_args
5944 yield from self.parsed_args or []
5945
5946 def parse_known_args(self, **kwargs):
5947 return self.parser.parse_known_args(self.all_args, **kwargs)
5948
5949 def parse_args(self):
5950 return self.parser.parse_args(self.all_args)
5951
5952
5953 class WebSocketsWrapper:
5954 """Wraps websockets module to use in non-async scopes"""
5955 pool = None
5956
5957 def __init__(self, url, headers=None, connect=True):
5958 self.loop = asyncio.new_event_loop()
5959 # XXX: "loop" is deprecated
5960 self.conn = websockets.connect(
5961 url, extra_headers=headers, ping_interval=None,
5962 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5963 if connect:
5964 self.__enter__()
5965 atexit.register(self.__exit__, None, None, None)
5966
5967 def __enter__(self):
5968 if not self.pool:
5969 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5970 return self
5971
5972 def send(self, *args):
5973 self.run_with_loop(self.pool.send(*args), self.loop)
5974
5975 def recv(self, *args):
5976 return self.run_with_loop(self.pool.recv(*args), self.loop)
5977
5978 def __exit__(self, type, value, traceback):
5979 try:
5980 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5981 finally:
5982 self._cancel_all_tasks(self.loop)  # cancel pending tasks before the loop is closed
5983 self.loop.close()
5984
5985 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5986 # for contributors: if any new library that uses asyncio needs to be run in non-async scopes, move these functions out of this class
5987 @staticmethod
5988 def run_with_loop(main, loop):
5989 if not asyncio.iscoroutine(main):
5990 raise ValueError(f'a coroutine was expected, got {main!r}')
5991
5992 try:
5993 return loop.run_until_complete(main)
5994 finally:
5995 loop.run_until_complete(loop.shutdown_asyncgens())
5996 if hasattr(loop, 'shutdown_default_executor'):
5997 loop.run_until_complete(loop.shutdown_default_executor())
5998
5999 @staticmethod
6000 def _cancel_all_tasks(loop):
6001 to_cancel = asyncio.all_tasks(loop)
6002
6003 if not to_cancel:
6004 return
6005
6006 for task in to_cancel:
6007 task.cancel()
6008
6009 # XXX: "loop" is removed in python 3.10+
6010 loop.run_until_complete(
6011 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
6012
6013 for task in to_cancel:
6014 if task.cancelled():
6015 continue
6016 if task.exception() is not None:
6017 loop.call_exception_handler({
6018 'message': 'unhandled exception during asyncio.run() shutdown',
6019 'exception': task.exception(),
6020 'task': task,
6021 })
6022
6023
6024 def merge_headers(*dicts):
6025 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
6026 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
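# e.g. merge_headers({'user-agent': 'A', 'Accept': '*/*'}, {'User-Agent': 'B'})
# == {'User-Agent': 'B', 'Accept': '*/*'} -- keys are title-cased and later dicts win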
6027
6028
6029 def cached_method(f):
6030 """Cache a method"""
6031 signature = inspect.signature(f)
6032
6033 @functools.wraps(f)
6034 def wrapper(self, *args, **kwargs):
6035 bound_args = signature.bind(self, *args, **kwargs)
6036 bound_args.apply_defaults()
6037 key = tuple(bound_args.arguments.values())[1:]
6038
6039 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
6040 if key not in cache:
6041 cache[key] = f(self, *args, **kwargs)
6042 return cache[key]
6043 return wrapper
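# Usage sketch (hypothetical class): repeated calls with the same arguments
# return the per-instance cached result
# class Client:
#     @cached_method
#     def fetch(self, url):
#         return expensive_request(url)  # hypothetical helper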
6044
6045
6046 class classproperty:
6047 """property access for class methods with optional caching"""
6048 def __new__(cls, func=None, *args, **kwargs):
6049 if not func:
6050 return functools.partial(cls, *args, **kwargs)
6051 return super().__new__(cls)
6052
6053 def __init__(self, func, *, cache=False):
6054 functools.update_wrapper(self, func)
6055 self.func = func
6056 self._cache = {} if cache else None
6057
6058 def __get__(self, _, cls):
6059 if self._cache is None:
6060 return self.func(cls)
6061 elif cls not in self._cache:
6062 self._cache[cls] = self.func(cls)
6063 return self._cache[cls]
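# Usage sketch:
# class MyIE:
#     @classproperty(cache=True)
#     def SUFFIX(cls):
#         return cls.__name__.lower()
# `MyIE.SUFFIX` is computed once per class, then served from the cache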
6064
6065
6066 class function_with_repr:
6067 def __init__(self, func, repr_=None):
6068 functools.update_wrapper(self, func)
6069 self.func, self.__repr = func, repr_
6070
6071 def __call__(self, *args, **kwargs):
6072 return self.func(*args, **kwargs)
6073
6074 def __repr__(self):
6075 if self.__repr:
6076 return self.__repr
6077 return f'{self.func.__module__}.{self.func.__qualname__}'
6078
6079
6080 class Namespace(types.SimpleNamespace):
6081 """Immutable namespace"""
6082
6083 def __iter__(self):
6084 return iter(self.__dict__.values())
6085
6086 @property
6087 def items_(self):
6088 return self.__dict__.items()
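# e.g. ns = Namespace(a=1, b=2); list(ns) == [1, 2] and dict(ns.items_) == {'a': 1, 'b': 2}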
6089
6090
6091 MEDIA_EXTENSIONS = Namespace(
6092 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
6093 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
6094 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
6095 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
6096 thumbnails=('jpg', 'png', 'webp'),
6097 storyboards=('mhtml', ),
6098 subtitles=('srt', 'vtt', 'ass', 'lrc'),
6099 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
6100 )
6101 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
6102 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
6103
6104 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
6105
6106
6107 class RetryManager:
6108 """Usage:
6109 for retry in RetryManager(...):
6110 try:
6111 ...
6112 except SomeException as err:
6113 retry.error = err
6114 continue
6115 """
6116 attempt, _error = 0, None
6117
6118 def __init__(self, _retries, _error_callback, **kwargs):
6119 self.retries = _retries or 0
6120 self.error_callback = functools.partial(_error_callback, **kwargs)
6121
6122 def _should_retry(self):
6123 return self._error is not NO_DEFAULT and self.attempt <= self.retries
6124
6125 @property
6126 def error(self):
6127 if self._error is NO_DEFAULT:
6128 return None
6129 return self._error
6130
6131 @error.setter
6132 def error(self, value):
6133 self._error = value
6134
6135 def __iter__(self):
6136 while self._should_retry():
6137 self.error = NO_DEFAULT
6138 self.attempt += 1
6139 yield self
6140 if self.error:
6141 self.error_callback(self.error, self.attempt, self.retries)
6142
6143 @staticmethod
6144 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
6145 """Utility function for reporting retries"""
6146 if count > retries:
6147 if error:
6148 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
6149 raise e
6150
6151 if not count:
6152 return warn(e)
6153 elif isinstance(e, ExtractorError):
6154 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
6155 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
6156
6157 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
6158 if delay:
6159 info(f'Sleeping {delay:.2f} seconds ...')
6160 time.sleep(delay)
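# Wiring sketch (hypothetical callbacks), matching the class docstring above:
# for retry in RetryManager(3, RetryManager.report_retry,
#                           sleep_func=1, info=print, warn=print):
#     try:
#         ...
#     except OSError as err:
#         retry.error = err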
6161
6162
6163 def make_archive_id(ie, video_id):
6164 ie_key = ie if isinstance(ie, str) else ie.ie_key()
6165 return f'{ie_key.lower()} {video_id}'
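# e.g. make_archive_id('Youtube', 'abc123') == 'youtube abc123'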
6166
6167
6168 def truncate_string(s, left, right=0):
6169 assert left > 3 and right >= 0
6170 if s is None or len(s) <= left + right:
6171 return s
6172 return f'{s[:left-3]}...{s[-right:] if right else ""}'
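# e.g. truncate_string('abcdefghij', 5) == 'ab...' and truncate_string('abcdefghij', 5, 2) == 'ab...ij'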
6173
6174
6175 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
6176 assert 'all' in alias_dict, '"all" alias is required'
6177 requested = list(start or [])
6178 for val in options:
6179 discard = val.startswith('-')
6180 if discard:
6181 val = val[1:]
6182
6183 if val in alias_dict:
6184 val = alias_dict[val] if not discard else [
6185 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
6186 # NB: Do not allow regex in aliases for performance
6187 requested = orderedSet_from_options(val, alias_dict, start=requested)
6188 continue
6189
6190 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
6191 else [val] if val in alias_dict['all'] else None)
6192 if current is None:
6193 raise ValueError(val)
6194
6195 if discard:
6196 for item in current:
6197 while item in requested:
6198 requested.remove(item)
6199 else:
6200 requested.extend(current)
6201
6202 return orderedSet(requested)
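# e.g. orderedSet_from_options(['all', '-b'], {'all': ['a', 'b', 'c']}) == ['a', 'c']
# -- the 'all' alias expands first, then '-b' discards its entry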
6203
6204
6205 class FormatSorter:
6206 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
6207
6208 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6209 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6210 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
6211 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6212 'height', 'width', 'proto', 'vext', 'abr', 'aext',
6213 'fps', 'fs_approx', 'source', 'id')
6214
6215 settings = {
6216 'vcodec': {'type': 'ordered', 'regex': True,
6217 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6218 'acodec': {'type': 'ordered', 'regex': True,
6219 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
6220 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6221 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6222 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6223 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6224 'vext': {'type': 'ordered', 'field': 'video_ext',
6225 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6226 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
6227 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
6228 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
6229 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
6230 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
6231 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6232 'field': ('vcodec', 'acodec'),
6233 'function': lambda it: int(any(v != 'none' for v in it))},
6234 'ie_pref': {'priority': True, 'type': 'extractor'},
6235 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
6236 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
6237 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
6238 'quality': {'convert': 'float', 'default': -1},
6239 'filesize': {'convert': 'bytes'},
6240 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
6241 'id': {'convert': 'string', 'field': 'format_id'},
6242 'height': {'convert': 'float_none'},
6243 'width': {'convert': 'float_none'},
6244 'fps': {'convert': 'float_none'},
6245 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
6246 'tbr': {'convert': 'float_none'},
6247 'vbr': {'convert': 'float_none'},
6248 'abr': {'convert': 'float_none'},
6249 'asr': {'convert': 'float_none'},
6250 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
6251
6252 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
6253 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
6254 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
6255 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
6256 'res': {'type': 'multiple', 'field': ('height', 'width'),
6257 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
6258
6259 # Actual field names
6260 'format_id': {'type': 'alias', 'field': 'id'},
6261 'preference': {'type': 'alias', 'field': 'ie_pref'},
6262 'language_preference': {'type': 'alias', 'field': 'lang'},
6263 'source_preference': {'type': 'alias', 'field': 'source'},
6264 'protocol': {'type': 'alias', 'field': 'proto'},
6265 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
6266 'audio_channels': {'type': 'alias', 'field': 'channels'},
6267
6268 # Deprecated
6269 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
6270 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
6271 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
6272 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
6273 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
6274 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
6275 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
6276 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
6277 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
6278 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
6279 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
6280 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
6281 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
6282 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
6283 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6284 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6285 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6286 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6287 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6288 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6289 }
6290
6291 def __init__(self, ydl, field_preference):
6292 self.ydl = ydl
6293 self._order = []
6294 self.evaluate_params(self.ydl.params, field_preference)
6295 if ydl.params.get('verbose'):
6296 self.print_verbose_info(self.ydl.write_debug)
6297
6298 def _get_field_setting(self, field, key):
6299 if field not in self.settings:
6300 if key in ('forced', 'priority'):
6301 return False
6302 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
6303 'deprecated and may be removed in a future version')
6304 self.settings[field] = {}
6305 propObj = self.settings[field]
6306 if key not in propObj:
6307 type = propObj.get('type')
6308 if key == 'field':
6309 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
6310 elif key == 'convert':
6311 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
6312 else:
6313 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
6314 propObj[key] = default
6315 return propObj[key]
6316
6317 def _resolve_field_value(self, field, value, convertNone=False):
6318 if value is None:
6319 if not convertNone:
6320 return None
6321 else:
6322 value = value.lower()
6323 conversion = self._get_field_setting(field, 'convert')
6324 if conversion == 'ignore':
6325 return None
6326 if conversion == 'string':
6327 return value
6328 elif conversion == 'float_none':
6329 return float_or_none(value)
6330 elif conversion == 'bytes':
6331 return parse_bytes(value)
6332 elif conversion == 'order':
6333 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
6334 use_regex = self._get_field_setting(field, 'regex')
6335 list_length = len(order_list)
6336 empty_pos = order_list.index('') if '' in order_list else list_length + 1
6337 if use_regex and value is not None:
6338 for i, regex in enumerate(order_list):
6339 if regex and re.match(regex, value):
6340 return list_length - i
6341 return list_length - empty_pos # not in list
6342 else: # not regex or value is None
6343 return list_length - (order_list.index(value) if value in order_list else empty_pos)
6344 else:
6345 if value.isnumeric():
6346 return float(value)
6347 else:
6348 self.settings[field]['convert'] = 'string'
6349 return value
6350
6351 def evaluate_params(self, params, sort_extractor):
6352 self._use_free_order = params.get('prefer_free_formats', False)
6353 self._sort_user = params.get('format_sort', [])
6354 self._sort_extractor = sort_extractor
6355
6356 def add_item(field, reverse, closest, limit_text):
6357 field = field.lower()
6358 if field in self._order:
6359 return
6360 self._order.append(field)
6361 limit = self._resolve_field_value(field, limit_text)
6362 data = {
6363 'reverse': reverse,
6364 'closest': False if limit is None else closest,
6365 'limit_text': limit_text,
6366 'limit': limit}
6367 if field in self.settings:
6368 self.settings[field].update(data)
6369 else:
6370 self.settings[field] = data
6371
6372 sort_list = (
6373 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
6374 + (tuple() if params.get('format_sort_force', False)
6375 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
6376 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
6377
6378 for item in sort_list:
6379 match = re.match(self.regex, item)
6380 if match is None:
6381 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
6382 field = match.group('field')
6383 if field is None:
6384 continue
6385 if self._get_field_setting(field, 'type') == 'alias':
6386 alias, field = field, self._get_field_setting(field, 'field')
6387 if self._get_field_setting(alias, 'deprecated'):
6388 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
6389 f'be removed in a future version. Please use {field} instead')
6390 reverse = match.group('reverse') is not None
6391 closest = match.group('separator') == '~'
6392 limit_text = match.group('limit')
6393
6394 has_limit = limit_text is not None
6395 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
6396 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
6397
6398 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6399 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6400 limit_count = len(limits)
6401 for (i, f) in enumerate(fields):
6402 add_item(f, reverse, closest,
6403 limits[i] if i < limit_count
6404 else limits[0] if has_limit and not has_multiple_limits
6405 else None)
6406
6407 def print_verbose_info(self, write_debug):
6408 if self._sort_user:
6409 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6410 if self._sort_extractor:
6411 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6412 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6413 '+' if self._get_field_setting(field, 'reverse') else '', field,
6414 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6415 self._get_field_setting(field, 'limit_text'),
6416 self._get_field_setting(field, 'limit'))
6417 if self._get_field_setting(field, 'limit_text') is not None else '')
6418 for field in self._order if self._get_field_setting(field, 'visible')]))
6419
6420 def _calculate_field_preference_from_value(self, format, field, type, value):
6421 reverse = self._get_field_setting(field, 'reverse')
6422 closest = self._get_field_setting(field, 'closest')
6423 limit = self._get_field_setting(field, 'limit')
6424
6425 if type == 'extractor':
6426 maximum = self._get_field_setting(field, 'max')
6427 if value is None or (maximum is not None and value >= maximum):
6428 value = -1
6429 elif type == 'boolean':
6430 in_list = self._get_field_setting(field, 'in_list')
6431 not_in_list = self._get_field_setting(field, 'not_in_list')
6432 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6433 elif type == 'ordered':
6434 value = self._resolve_field_value(field, value, True)
6435
6436 # try to convert to number
6437 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6438 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6439 if is_num:
6440 value = val_num
6441
6442 return ((-10, 0) if value is None
6443 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6444 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6445 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6446 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6447 else (-1, value, 0))
6448
6449 def _calculate_field_preference(self, format, field):
6450 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6451 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6452 if type == 'multiple':
6453 type = 'field' # Only 'field' is allowed in multiple for now
6454 actual_fields = self._get_field_setting(field, 'field')
6455
6456 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6457 else:
6458 value = get_value(field)
6459 return self._calculate_field_preference_from_value(format, field, type, value)
6460
6461 def calculate_preference(self, format):
6462 # Determine missing protocol
6463 if not format.get('protocol'):
6464 format['protocol'] = determine_protocol(format)
6465
6466 # Determine missing ext
6467 if not format.get('ext') and 'url' in format:
6468 format['ext'] = determine_ext(format['url'])
6469 if format.get('vcodec') == 'none':
6470 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6471 format['video_ext'] = 'none'
6472 else:
6473 format['video_ext'] = format['ext']
6474 format['audio_ext'] = 'none'
6475 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6476 # format['preference'] = -1000
6477
6478 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
6479 # HEVC-over-FLV is not permitted by FLV's original specification
6480 # ref. https://trac.ffmpeg.org/ticket/6389
6481 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
6482 format['preference'] = -100
6483
6484 # Determine missing bitrates
6485 if format.get('tbr') is None:
6486 if format.get('vbr') is not None and format.get('abr') is not None:
6487 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6488 else:
6489 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6490 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6491 if format.get('acodec') != 'none' and format.get('abr') is None:
6492 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6493
6494 return tuple(self._calculate_field_preference(format, field) for field in self._order)
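# Usage sketch (as wired up by YoutubeDL, simplified):
# sorter = FormatSorter(ydl, field_preference=['res', 'fps'])
# formats.sort(key=sorter.calculate_preference)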
6495
6496
6497 # Deprecated
6498 has_certifi = bool(certifi)
6499 has_websockets = bool(websockets)
6500
6501
6502 def load_plugins(name, suffix, namespace):
6503 from .plugins import load_plugins
6504 ret = load_plugins(name, suffix)
6505 namespace.update(ret)
6506 return ret