import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
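
# Usage sketch (illustrative, editor's note): the object is dumped to a temp
# file in the target directory and then renamed over `fn`, so a crash
# mid-write never leaves a truncated JSON file behind.
#   write_json_file({'id': 'abc', 'title': 'Example'}, 'info.json')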


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
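
# Example (illustrative): expands a prefixed path into ElementTree's
# '{namespace}tag' notation; the MRSS URL below is just a sample mapping.
#   >>> xpath_with_ns('ep/media:content', {'media': 'http://search.yahoo.com/mrss/'})
#   'ep/{http://search.yahoo.com/mrss/}content'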


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
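
# Example (illustrative): returns both the inner text and the whole markup of
# the first matching element, even when the document has surrounding content.
#   >>> get_element_text_and_html_by_tag('span', '<div><span id="a">text</span></div>')
#   ('text', '<span id="a">text</span>')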


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries with their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
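
# Example (illustrative): tags are stripped, paragraph breaks become newlines
# and entities are decoded.
#   >>> clean_html('<p>Line one</p><p>Line &amp; two</p>')
#   'Line one\nLine & two'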


class LenientJSONDecoder(json.JSONDecoder):
    # TODO: Write tests
    def __init__(self, *args, transform_source=None, ignore_extra=False, close_objects=0, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        self._close_attempts = 2 * close_objects
        super().__init__(*args, **kwargs)

    @staticmethod
    def _close_object(err):
        doc = err.doc[:err.pos]
        # We need to add a comma first to get the correct error message
        if err.msg.startswith('Expecting \',\''):
            return doc + ','
        elif not doc.endswith(','):
            return

        if err.msg.startswith('Expecting property name'):
            return doc[:-1] + '}'
        elif err.msg.startswith('Expecting value'):
            return doc[:-1] + ']'

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        for attempt in range(self._close_attempts + 1):
            try:
                if self.ignore_extra:
                    return self.raw_decode(s.lstrip())[0]
                return super().decode(s)
            except json.JSONDecodeError as e:
                if e.pos is None:
                    raise
                elif attempt < self._close_attempts:
                    s = self._close_object(e)
                    if s is not None:
                        continue
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
        assert False, 'Too many attempts to decode JSON'
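
# Example (illustrative): with close_objects=1, a truncated object can be
# closed and decoded instead of raising JSONDecodeError.
#   >>> LenientJSONDecoder(close_objects=1).decode('{"a": 1')
#   {'a': 1}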


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
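
# Example (illustrative): RFC 2822 dates convert to Unix timestamps.
#   >>> timeconvert('Thu, 01 Jan 1970 00:00:00 +0000')
#   0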


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
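
# Examples (illustrative; traced from the rules above, not authoritative):
#   >>> sanitize_filename('foo: bar', restricted=True)
#   'foo_-_bar'
#   >>> sanitize_filename('a/b')  # default mode swaps in full-width look-alikes
#   'a⧸b'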


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
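
# Examples (illustrative):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com')
#   'https://example.com'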


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
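
# Example (illustrative): credentials are stripped from the URL and returned
# as a ready-made Basic auth header value.
#   >>> extract_basic_auth('http://user:pass@example.com/feed')
#   ('http://example.com/feed', 'Basic dXNlcjpwYXNz')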


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
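
# Example (illustrative): order-preserving de-duplication; it works for
# unhashable items too, since membership is checked against a list.
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]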


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
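
# Example (illustrative): named and numeric entities are both handled.
#   >>> unescapeHTML('Tom &amp; Jerry &#39;toon')
#   "Tom & Jerry 'toon"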


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
             https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        self.__text_mode = kwargs.get('encoding') or kwargs.get('errors') or text or kwargs.get('universal_newlines')
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.__text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode
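
# Usage sketch (illustrative; assumes an `ffmpeg` binary on PATH):
#   stdout, stderr, returncode = Popen.run(
#       ['ffmpeg', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)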


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
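
# Examples (illustrative):
#   >>> formatSeconds(3661)
#   '1:01:01'
#   >>> formatSeconds(61.5, msec=True)
#   '1:01.500'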


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-match-filter triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-match-filter'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
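
# Example (illustrative): the legacy 'Youtubedl-no-compression' marker drops
# Accept-Encoding and is itself removed before the real request goes out.
#   >>> handle_youtubedl_headers({'Youtubedl-no-compression': '1', 'Accept-Encoding': 'gzip'})
#   {}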


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if is_path_like(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if is_path_like(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1696
1697
1698 class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
1699 def __init__(self, cookiejar=None):
1700 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
1701
1702 def http_response(self, request, response):
1703 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
1704
1705 https_request = urllib.request.HTTPCookieProcessor.http_request
1706 https_response = http_response
1707
1708
1709 class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
1710 """YoutubeDL redirect handler
1711
1712 The code is based on HTTPRedirectHandler implementation from CPython [1].
1713
1714 This redirect handler solves two issues:
1715         - ensures the redirect URL is always unicode (only relevant under Python 2)
1716         - introduces support for the HTTP response status code
1717 308 Permanent Redirect [2] used by some sites [3]
1718
1719 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1720 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1721 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1722 """
1723
1724 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
1725
1726 def redirect_request(self, req, fp, code, msg, headers, newurl):
1727 """Return a Request or None in response to a redirect.
1728
1729 This is called by the http_error_30x methods when a
1730 redirection response is received. If a redirection should
1731 take place, return a new Request to allow http_error_30x to
1732 perform the redirect. Otherwise, raise HTTPError if no-one
1733 else should try to handle this url. Return None if you can't
1734 but another Handler might.
1735 """
1736 m = req.get_method()
1737 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1738 or code in (301, 302, 303) and m == "POST")):
1739 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
1740 # Strictly (according to RFC 2616), 301 or 302 in response to
1741 # a POST MUST NOT cause a redirection without confirmation
1742 # from the user (of urllib.request, in this case). In practice,
1743 # essentially all clients do redirect in this case, so we do
1744 # the same.
1745
1746 # Be conciliant with URIs containing a space. This is mainly
1747 # redundant with the more complete encoding done in http_error_302(),
1748 # but it is kept for compatibility with other callers.
1749 newurl = newurl.replace(' ', '%20')
1750
1751 CONTENT_HEADERS = ("content-length", "content-type")
1752         # Strip the entity headers (Content-*), since the original body is not resent
1753 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1754
1755 # A 303 must either use GET or HEAD for subsequent request
1756 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1757 if code == 303 and m != 'HEAD':
1758 m = 'GET'
1759 # 301 and 302 redirects are commonly turned into a GET from a POST
1760 # for subsequent requests by browsers, so we'll do the same.
1761 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1762 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1763 if code in (301, 302) and m == 'POST':
1764 m = 'GET'
1765
1766 return urllib.request.Request(
1767 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1768 unverifiable=True, method=m)
1769
1770
1771 def extract_timezone(date_str):
1772 m = re.search(
1773 r'''(?x)
1774 ^.{8,}? # >=8 char non-TZ prefix, if present
1775 (?P<tz>Z| # just the UTC Z, or
1776 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1777 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1778 [ ]? # optional space
1779 (?P<sign>\+|-) # +/-
1780 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1781 $)
1782 ''', date_str)
1783 if not m:
1784 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1785 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1786 if timezone is not None:
1787 date_str = date_str[:-len(m.group('tz'))]
1788 timezone = datetime.timedelta(hours=timezone or 0)
1789 else:
1790 date_str = date_str[:-len(m.group('tz'))]
1791 if not m.group('sign'):
1792 timezone = datetime.timedelta()
1793 else:
1794 sign = 1 if m.group('sign') == '+' else -1
1795 timezone = datetime.timedelta(
1796 hours=sign * int(m.group('hours')),
1797 minutes=sign * int(m.group('minutes')))
1798 return timezone, date_str
1799
1800
1801 def parse_iso8601(date_str, delimiter='T', timezone=None):
1802 """ Return a UNIX timestamp from the given date """
1803
1804 if date_str is None:
1805 return None
1806
1807 date_str = re.sub(r'\.[0-9]+', '', date_str)
1808
1809 if timezone is None:
1810 timezone, date_str = extract_timezone(date_str)
1811
1812 with contextlib.suppress(ValueError):
1813 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1814 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1815 return calendar.timegm(dt.timetuple())
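
# Example (illustrative): an explicit +01:00 offset is converted to UTC
# before the epoch seconds are computed:
#     >>> parse_iso8601('2023-01-01T12:00:00+01:00')
#     1672570800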
1816
1817
1818 def date_formats(day_first=True):
1819 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1820
1821
1822 def unified_strdate(date_str, day_first=True):
1823 """Return a string with the date in the format YYYYMMDD"""
1824
1825 if date_str is None:
1826 return None
1827 upload_date = None
1828 # Replace commas
1829 date_str = date_str.replace(',', ' ')
1830 # Remove AM/PM + timezone
1831 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1832 _, date_str = extract_timezone(date_str)
1833
1834 for expression in date_formats(day_first):
1835 with contextlib.suppress(ValueError):
1836 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1837 if upload_date is None:
1838 timetuple = email.utils.parsedate_tz(date_str)
1839 if timetuple:
1840 with contextlib.suppress(ValueError):
1841 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1842 if upload_date is not None:
1843 return str(upload_date)
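
# Example (illustrative; assumes '%B %d %Y' is among the DATE_FORMATS
# defined elsewhere in this module):
#     >>> unified_strdate('December 21, 2010')
#     '20101221'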
1844
1845
1846 def unified_timestamp(date_str, day_first=True):
1847 if date_str is None:
1848 return None
1849
1850 date_str = re.sub(r'\s+', ' ', re.sub(
1851 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1852
1853 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1854 timezone, date_str = extract_timezone(date_str)
1855
1856 # Remove AM/PM + timezone
1857 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1858
1859 # Remove unrecognized timezones from ISO 8601 alike timestamps
1860 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1861 if m:
1862 date_str = date_str[:-len(m.group('tz'))]
1863
1864 # Python only supports microseconds, so remove nanoseconds
1865 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1866 if m:
1867 date_str = m.group(1)
1868
1869 for expression in date_formats(day_first):
1870 with contextlib.suppress(ValueError):
1871 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1872 return calendar.timegm(dt.timetuple())
1873
1874 timetuple = email.utils.parsedate_tz(date_str)
1875 if timetuple:
1876 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
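
# Example (illustrative; assumes '%Y-%m-%d %H:%M:%S' is among the DATE_FORMATS
# defined elsewhere in this module):
#     >>> unified_timestamp('2023-01-01 12:00:00')
#     1672574400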
1877
1878
1879 def determine_ext(url, default_ext='unknown_video'):
1880 if url is None or '.' not in url:
1881 return default_ext
1882 guess = url.partition('?')[0].rpartition('.')[2]
1883 if re.match(r'^[A-Za-z0-9]+$', guess):
1884 return guess
1885     # Try to extract the ext from URLs like http://example.com/foo/bar.mp4/?download
1886 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1887 return guess.rstrip('/')
1888 else:
1889 return default_ext
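
# Examples (illustrative; the second case assumes 'mp4' is in KNOWN_EXTENSIONS):
#     >>> determine_ext('http://example.com/video.mp4?dl=1')
#     'mp4'
#     >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#     'mp4'
#     >>> determine_ext('http://example.com/foo')
#     'unknown_video'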
1890
1891
1892 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1893 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1894
1895
1896 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1897 R"""
1898 Return a datetime object from a string.
1899 Supported format:
1900 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1901
1902 @param format strftime format of DATE
1903 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1904 auto: round to the unit provided in date_str (if applicable).
1905 """
1906 auto_precision = False
1907 if precision == 'auto':
1908 auto_precision = True
1909 precision = 'microsecond'
1910 today = datetime_round(datetime.datetime.utcnow(), precision)
1911 if date_str in ('now', 'today'):
1912 return today
1913 if date_str == 'yesterday':
1914 return today - datetime.timedelta(days=1)
1915 match = re.match(
1916 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1917 date_str)
1918 if match is not None:
1919 start_time = datetime_from_str(match.group('start'), precision, format)
1920 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1921 unit = match.group('unit')
1922 if unit == 'month' or unit == 'year':
1923 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1924 unit = 'day'
1925 else:
1926 if unit == 'week':
1927 unit = 'day'
1928 time *= 7
1929 delta = datetime.timedelta(**{unit + 's': time})
1930 new_date = start_time + delta
1931 if auto_precision:
1932 return datetime_round(new_date, unit)
1933 return new_date
1934
1935 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1936
1937
1938 def date_from_str(date_str, format='%Y%m%d', strict=False):
1939 R"""
1940 Return a date object from a string using datetime_from_str
1941
1942 @param strict Restrict allowed patterns to "YYYYMMDD" and
1943 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1944 """
1945 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1946 raise ValueError(f'Invalid date format "{date_str}"')
1947 return datetime_from_str(date_str, precision='microsecond', format=format).date()
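
# Examples (illustrative):
#     >>> date_from_str('20230115')
#     datetime.date(2023, 1, 15)
#     >>> date_from_str('now-1week')  # the calendar date seven days ago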
1948
1949
1950 def datetime_add_months(dt, months):
1951 """Increment/Decrement a datetime object by months."""
1952 month = dt.month + months - 1
1953 year = dt.year + month // 12
1954 month = month % 12 + 1
1955 day = min(dt.day, calendar.monthrange(year, month)[1])
1956 return dt.replace(year, month, day)
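
# Example (illustrative): days that overflow the target month are clamped
# to its last day:
#     >>> datetime_add_months(datetime.datetime(2023, 1, 31), 1)
#     datetime.datetime(2023, 2, 28, 0, 0)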
1957
1958
1959 def datetime_round(dt, precision='day'):
1960 """
1961 Round a datetime object's time to a specific precision
1962 """
1963 if precision == 'microsecond':
1964 return dt
1965
1966 unit_seconds = {
1967 'day': 86400,
1968 'hour': 3600,
1969 'minute': 60,
1970 'second': 1,
1971 }
1972 roundto = lambda x, n: ((x + n / 2) // n) * n
1973 timestamp = calendar.timegm(dt.timetuple())
1974 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1975
1976
1977 def hyphenate_date(date_str):
1978 """
1979 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1980 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1981 if match is not None:
1982 return '-'.join(match.groups())
1983 else:
1984 return date_str
1985
1986
1987 class DateRange:
1988 """Represents a time interval between two dates"""
1989
1990 def __init__(self, start=None, end=None):
1991 """start and end must be strings in the format accepted by date"""
1992 if start is not None:
1993 self.start = date_from_str(start, strict=True)
1994 else:
1995 self.start = datetime.datetime.min.date()
1996 if end is not None:
1997 self.end = date_from_str(end, strict=True)
1998 else:
1999 self.end = datetime.datetime.max.date()
2000 if self.start > self.end:
2001             raise ValueError('Date range: "%s"; the start date must be before the end date' % self)
2002
2003 @classmethod
2004 def day(cls, day):
2005 """Returns a range that only contains the given day"""
2006 return cls(day, day)
2007
2008 def __contains__(self, date):
2009 """Check if the date is in the range"""
2010 if not isinstance(date, datetime.date):
2011 date = date_from_str(date)
2012 return self.start <= date <= self.end
2013
2014 def __str__(self):
2015 return f'{self.start.isoformat()} - {self.end.isoformat()}'
2016
2017 def __eq__(self, other):
2018 return (isinstance(other, DateRange)
2019 and self.start == other.start and self.end == other.end)
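
# Example (illustrative):
#     >>> '20230115' in DateRange('20230101', '20230131')
#     True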
2020
2021
2022 def platform_name():
2023 """ Returns the platform name as a str """
2024 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
2025 return platform.platform()
2026
2027
2028 @functools.cache
2029 def system_identifier():
2030 python_implementation = platform.python_implementation()
2031 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2032 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
2033 libc_ver = []
2034 with contextlib.suppress(OSError): # We may not have access to the executable
2035 libc_ver = platform.libc_ver()
2036
2037 return 'Python %s (%s %s %s) - %s (%s%s)' % (
2038 platform.python_version(),
2039 python_implementation,
2040 platform.machine(),
2041 platform.architecture()[0],
2042 platform.platform(),
2043 ssl.OPENSSL_VERSION,
2044 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
2045 )
2046
2047
2048 @functools.cache
2049 def get_windows_version():
2050     ''' Get the Windows version. Returns () if not running on Windows '''
2051 if compat_os_name == 'nt':
2052 return version_tuple(platform.win32_ver()[1])
2053 else:
2054 return ()
2055
2056
2057 def write_string(s, out=None, encoding=None):
2058 assert isinstance(s, str)
2059 out = out or sys.stderr
2060 # `sys.stderr` might be `None` (Ref: https://github.com/pyinstaller/pyinstaller/pull/7217)
2061 if not out:
2062 return
2063
2064 if compat_os_name == 'nt' and supports_terminal_sequences(out):
2065 s = re.sub(r'([\r\n]+)', r' \1', s)
2066
2067 enc, buffer = None, out
2068 if 'b' in getattr(out, 'mode', ''):
2069 enc = encoding or preferredencoding()
2070 elif hasattr(out, 'buffer'):
2071 buffer = out.buffer
2072 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2073
2074 buffer.write(s.encode(enc, 'ignore') if enc else s)
2075 out.flush()
2076
2077
2078 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2079 from . import _IN_CLI
2080 if _IN_CLI:
2081 if msg in deprecation_warning._cache:
2082 return
2083 deprecation_warning._cache.add(msg)
2084 if printer:
2085 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2086 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2087 else:
2088 import warnings
2089 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2090
2091
2092 deprecation_warning._cache = set()
2093
2094
2095 def bytes_to_intlist(bs):
2096 if not bs:
2097 return []
2098     if isinstance(bs[0], int):  # bytes-like object
2099 return list(bs)
2100 else:
2101 return [ord(c) for c in bs]
2102
2103
2104 def intlist_to_bytes(xs):
2105 if not xs:
2106 return b''
2107 return struct.pack('%dB' % len(xs), *xs)
2108
2109
2110 class LockingUnsupportedError(OSError):
2111 msg = 'File locking is not supported'
2112
2113 def __init__(self):
2114 super().__init__(self.msg)
2115
2116
2117 # Cross-platform file locking
2118 if sys.platform == 'win32':
2119 import ctypes
2120 import ctypes.wintypes
2121 import msvcrt
2122
2123 class OVERLAPPED(ctypes.Structure):
2124 _fields_ = [
2125 ('Internal', ctypes.wintypes.LPVOID),
2126 ('InternalHigh', ctypes.wintypes.LPVOID),
2127 ('Offset', ctypes.wintypes.DWORD),
2128 ('OffsetHigh', ctypes.wintypes.DWORD),
2129 ('hEvent', ctypes.wintypes.HANDLE),
2130 ]
2131
2132 kernel32 = ctypes.WinDLL('kernel32')
2133 LockFileEx = kernel32.LockFileEx
2134 LockFileEx.argtypes = [
2135 ctypes.wintypes.HANDLE, # hFile
2136 ctypes.wintypes.DWORD, # dwFlags
2137 ctypes.wintypes.DWORD, # dwReserved
2138 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2139 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2140 ctypes.POINTER(OVERLAPPED) # Overlapped
2141 ]
2142 LockFileEx.restype = ctypes.wintypes.BOOL
2143 UnlockFileEx = kernel32.UnlockFileEx
2144 UnlockFileEx.argtypes = [
2145 ctypes.wintypes.HANDLE, # hFile
2146 ctypes.wintypes.DWORD, # dwReserved
2147 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2148 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2149 ctypes.POINTER(OVERLAPPED) # Overlapped
2150 ]
2151 UnlockFileEx.restype = ctypes.wintypes.BOOL
2152 whole_low = 0xffffffff
2153 whole_high = 0x7fffffff
2154
2155 def _lock_file(f, exclusive, block):
2156 overlapped = OVERLAPPED()
2157 overlapped.Offset = 0
2158 overlapped.OffsetHigh = 0
2159 overlapped.hEvent = 0
2160 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2161
2162 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2163 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2164 0, whole_low, whole_high, f._lock_file_overlapped_p):
2165             # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2166 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2167
2168 def _unlock_file(f):
2169 assert f._lock_file_overlapped_p
2170 handle = msvcrt.get_osfhandle(f.fileno())
2171 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2172 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2173
2174 else:
2175 try:
2176 import fcntl
2177
2178 def _lock_file(f, exclusive, block):
2179 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2180 if not block:
2181 flags |= fcntl.LOCK_NB
2182 try:
2183 fcntl.flock(f, flags)
2184 except BlockingIOError:
2185 raise
2186 except OSError: # AOSP does not have flock()
2187 fcntl.lockf(f, flags)
2188
2189 def _unlock_file(f):
2190 with contextlib.suppress(OSError):
2191 return fcntl.flock(f, fcntl.LOCK_UN)
2192 with contextlib.suppress(OSError):
2193 return fcntl.lockf(f, fcntl.LOCK_UN) # AOSP does not have flock()
2194 return fcntl.flock(f, fcntl.LOCK_UN | fcntl.LOCK_NB) # virtiofs needs LOCK_NB on unlocking
2195
2196 except ImportError:
2197
2198 def _lock_file(f, exclusive, block):
2199 raise LockingUnsupportedError()
2200
2201 def _unlock_file(f):
2202 raise LockingUnsupportedError()
2203
2204
2205 class locked_file:
2206 locked = False
2207
2208 def __init__(self, filename, mode, block=True, encoding=None):
2209 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2210 raise NotImplementedError(mode)
2211 self.mode, self.block = mode, block
2212
2213 writable = any(f in mode for f in 'wax+')
2214 readable = any(f in mode for f in 'r+')
2215 flags = functools.reduce(operator.ior, (
2216 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2217 getattr(os, 'O_BINARY', 0), # Windows only
2218 getattr(os, 'O_NOINHERIT', 0), # Windows only
2219 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2220 os.O_APPEND if 'a' in mode else 0,
2221 os.O_EXCL if 'x' in mode else 0,
2222 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2223 ))
2224
2225 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2226
2227 def __enter__(self):
2228 exclusive = 'r' not in self.mode
2229 try:
2230 _lock_file(self.f, exclusive, self.block)
2231 self.locked = True
2232 except OSError:
2233 self.f.close()
2234 raise
2235 if 'w' in self.mode:
2236 try:
2237 self.f.truncate()
2238 except OSError as e:
2239 if e.errno not in (
2240 errno.ESPIPE, # Illegal seek - expected for FIFO
2241 errno.EINVAL, # Invalid argument - expected for /dev/null
2242 ):
2243 raise
2244 return self
2245
2246 def unlock(self):
2247 if not self.locked:
2248 return
2249 try:
2250 _unlock_file(self.f)
2251 finally:
2252 self.locked = False
2253
2254 def __exit__(self, *_):
2255 try:
2256 self.unlock()
2257 finally:
2258 self.f.close()
2259
2260 open = __enter__
2261 close = __exit__
2262
2263 def __getattr__(self, attr):
2264 return getattr(self.f, attr)
2265
2266 def __iter__(self):
2267 return iter(self.f)
2268
2269
2270 @functools.cache
2271 def get_filesystem_encoding():
2272 encoding = sys.getfilesystemencoding()
2273 return encoding if encoding is not None else 'utf-8'
2274
2275
2276 def shell_quote(args):
2277 quoted_args = []
2278 encoding = get_filesystem_encoding()
2279 for a in args:
2280 if isinstance(a, bytes):
2281 # We may get a filename encoded with 'encodeFilename'
2282 a = a.decode(encoding)
2283 quoted_args.append(compat_shlex_quote(a))
2284 return ' '.join(quoted_args)
2285
2286
2287 def smuggle_url(url, data):
2288 """ Pass additional data in a URL for internal use. """
2289
2290 url, idata = unsmuggle_url(url, {})
2291 data.update(idata)
2292 sdata = urllib.parse.urlencode(
2293 {'__youtubedl_smuggle': json.dumps(data)})
2294 return url + '#' + sdata
2295
2296
2297 def unsmuggle_url(smug_url, default=None):
2298 if '#__youtubedl_smuggle' not in smug_url:
2299 return smug_url, default
2300 url, _, sdata = smug_url.rpartition('#')
2301 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2302 data = json.loads(jsond)
2303 return url, data
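
# Example (illustrative) round trip:
#     >>> url = smuggle_url('https://example.com/v/1', {'referer': 'https://example.com/'})
#     >>> unsmuggle_url(url)
#     ('https://example.com/v/1', {'referer': 'https://example.com/'})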
2304
2305
2306 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2307     """ Format numbers with decimal suffixes like k, M, G, etc. """
2308 num, factor = float_or_none(num), float(factor)
2309 if num is None or num < 0:
2310 return None
2311 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2312 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2313 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2314 if factor == 1024:
2315 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2316 converted = num / (factor ** exponent)
2317 return fmt % (converted, suffix)
2318
2319
2320 def format_bytes(bytes):
2321 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
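
# Examples (illustrative):
#     >>> format_decimal_suffix(1234567)
#     '1M'
#     >>> format_bytes(1536)
#     '1.50KiB'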
2322
2323
2324 def lookup_unit_table(unit_table, s, strict=False):
2325 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
2326 units_re = '|'.join(re.escape(u) for u in unit_table)
2327 m = (re.fullmatch if strict else re.match)(
2328 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
2329 if not m:
2330 return None
2331
2332 num = float(m.group('num').replace(',', '.'))
2333 mult = unit_table[m.group('unit')]
2334 return round(num * mult)
2335
2336
2337 def parse_bytes(s):
2338 """Parse a string indicating a byte quantity into an integer"""
2339 return lookup_unit_table(
2340 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2341 s.upper(), strict=True)
2342
2343
2344 def parse_filesize(s):
2345 if s is None:
2346 return None
2347
2348 # The lower-case forms are of course incorrect and unofficial,
2349 # but we support those too
2350 _UNIT_TABLE = {
2351 'B': 1,
2352 'b': 1,
2353 'bytes': 1,
2354 'KiB': 1024,
2355 'KB': 1000,
2356 'kB': 1024,
2357 'Kb': 1000,
2358 'kb': 1000,
2359 'kilobytes': 1000,
2360 'kibibytes': 1024,
2361 'MiB': 1024 ** 2,
2362 'MB': 1000 ** 2,
2363 'mB': 1024 ** 2,
2364 'Mb': 1000 ** 2,
2365 'mb': 1000 ** 2,
2366 'megabytes': 1000 ** 2,
2367 'mebibytes': 1024 ** 2,
2368 'GiB': 1024 ** 3,
2369 'GB': 1000 ** 3,
2370 'gB': 1024 ** 3,
2371 'Gb': 1000 ** 3,
2372 'gb': 1000 ** 3,
2373 'gigabytes': 1000 ** 3,
2374 'gibibytes': 1024 ** 3,
2375 'TiB': 1024 ** 4,
2376 'TB': 1000 ** 4,
2377 'tB': 1024 ** 4,
2378 'Tb': 1000 ** 4,
2379 'tb': 1000 ** 4,
2380 'terabytes': 1000 ** 4,
2381 'tebibytes': 1024 ** 4,
2382 'PiB': 1024 ** 5,
2383 'PB': 1000 ** 5,
2384 'pB': 1024 ** 5,
2385 'Pb': 1000 ** 5,
2386 'pb': 1000 ** 5,
2387 'petabytes': 1000 ** 5,
2388 'pebibytes': 1024 ** 5,
2389 'EiB': 1024 ** 6,
2390 'EB': 1000 ** 6,
2391 'eB': 1024 ** 6,
2392 'Eb': 1000 ** 6,
2393 'eb': 1000 ** 6,
2394 'exabytes': 1000 ** 6,
2395 'exbibytes': 1024 ** 6,
2396 'ZiB': 1024 ** 7,
2397 'ZB': 1000 ** 7,
2398 'zB': 1024 ** 7,
2399 'Zb': 1000 ** 7,
2400 'zb': 1000 ** 7,
2401 'zettabytes': 1000 ** 7,
2402 'zebibytes': 1024 ** 7,
2403 'YiB': 1024 ** 8,
2404 'YB': 1000 ** 8,
2405 'yB': 1024 ** 8,
2406 'Yb': 1000 ** 8,
2407 'yb': 1000 ** 8,
2408 'yottabytes': 1000 ** 8,
2409 'yobibytes': 1024 ** 8,
2410 }
2411
2412 return lookup_unit_table(_UNIT_TABLE, s)
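
# Examples (illustrative): parse_bytes() is strict and binary-based, while
# parse_filesize() accepts the looser forms from the table above:
#     >>> parse_bytes('10K')
#     10240
#     >>> parse_filesize('1.5GiB')
#     1610612736
#     >>> parse_filesize('1.5GB')
#     1500000000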
2413
2414
2415 def parse_count(s):
2416 if s is None:
2417 return None
2418
2419 s = re.sub(r'^[^\d]+\s', '', s).strip()
2420
2421 if re.match(r'^[\d,.]+$', s):
2422 return str_to_int(s)
2423
2424 _UNIT_TABLE = {
2425 'k': 1000,
2426 'K': 1000,
2427 'm': 1000 ** 2,
2428 'M': 1000 ** 2,
2429 'kk': 1000 ** 2,
2430 'KK': 1000 ** 2,
2431 'b': 1000 ** 3,
2432 'B': 1000 ** 3,
2433 }
2434
2435 ret = lookup_unit_table(_UNIT_TABLE, s)
2436 if ret is not None:
2437 return ret
2438
2439 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2440 if mobj:
2441 return str_to_int(mobj.group(1))
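
# Examples (illustrative):
#     >>> parse_count('1.2M')
#     1200000
#     >>> parse_count('1,234 views')
#     1234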
2442
2443
2444 def parse_resolution(s, *, lenient=False):
2445 if s is None:
2446 return {}
2447
2448 if lenient:
2449 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2450 else:
2451 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2452 if mobj:
2453 return {
2454 'width': int(mobj.group('w')),
2455 'height': int(mobj.group('h')),
2456 }
2457
2458 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2459 if mobj:
2460 return {'height': int(mobj.group(1))}
2461
2462 mobj = re.search(r'\b([48])[kK]\b', s)
2463 if mobj:
2464 return {'height': int(mobj.group(1)) * 540}
2465
2466 return {}
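
# Examples (illustrative):
#     >>> parse_resolution('1920x1080')
#     {'width': 1920, 'height': 1080}
#     >>> parse_resolution('720p')
#     {'height': 720}
#     >>> parse_resolution('4K')
#     {'height': 2160}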
2467
2468
2469 def parse_bitrate(s):
2470 if not isinstance(s, str):
2471 return
2472 mobj = re.search(r'\b(\d+)\s*kbps', s)
2473 if mobj:
2474 return int(mobj.group(1))
2475
2476
2477 def month_by_name(name, lang='en'):
2478     """ Return the number of a month by its (locale-independent) English name """
2479
2480 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2481
2482 try:
2483 return month_names.index(name) + 1
2484 except ValueError:
2485 return None
2486
2487
2488 def month_by_abbreviation(abbrev):
2489     """ Return the number of a month by its (locale-independent) English
2490     abbreviation """
2491
2492 try:
2493 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2494 except ValueError:
2495 return None
2496
2497
2498 def fix_xml_ampersands(xml_str):
2499     """Replace all '&' that are not part of an entity with '&amp;' in XML"""
2500 return re.sub(
2501 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2502 '&amp;',
2503 xml_str)
2504
2505
2506 def setproctitle(title):
2507 assert isinstance(title, str)
2508
2509 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2510 try:
2511 import ctypes
2512 except ImportError:
2513 return
2514
2515 try:
2516 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2517 except OSError:
2518 return
2519 except TypeError:
2520 # LoadLibrary in Windows Python 2.7.13 only expects
2521 # a bytestring, but since unicode_literals turns
2522 # every string into a unicode string, it fails.
2523 return
2524 title_bytes = title.encode()
2525 buf = ctypes.create_string_buffer(len(title_bytes))
2526 buf.value = title_bytes
2527 try:
2528 libc.prctl(15, buf, 0, 0, 0)
2529 except AttributeError:
2530 return # Strange libc, just skip this
2531
2532
2533 def remove_start(s, start):
2534 return s[len(start):] if s is not None and s.startswith(start) else s
2535
2536
2537 def remove_end(s, end):
2538 return s[:-len(end)] if s is not None and s.endswith(end) else s
2539
2540
2541 def remove_quotes(s):
2542 if s is None or len(s) < 2:
2543 return s
2544 for quote in ('"', "'", ):
2545 if s[0] == quote and s[-1] == quote:
2546 return s[1:-1]
2547 return s
2548
2549
2550 def get_domain(url):
2551 """
2552 This implementation is inconsistent, but is kept for compatibility.
2553 Use this only for "webpage_url_domain"
2554 """
2555 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2556
2557
2558 def url_basename(url):
2559 path = urllib.parse.urlparse(url).path
2560 return path.strip('/').split('/')[-1]
2561
2562
2563 def base_url(url):
2564 return re.match(r'https?://[^?#]+/', url).group()
2565
2566
2567 def urljoin(base, path):
2568 if isinstance(path, bytes):
2569 path = path.decode()
2570 if not isinstance(path, str) or not path:
2571 return None
2572 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2573 return path
2574 if isinstance(base, bytes):
2575 base = base.decode()
2576 if not isinstance(base, str) or not re.match(
2577 r'^(?:https?:)?//', base):
2578 return None
2579 return urllib.parse.urljoin(base, path)
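
# Examples (illustrative): relative paths resolve against an http(s) base;
# a path that already carries a scheme is passed through, and a non-http(s)
# base yields None:
#     >>> urljoin('https://example.com/a/', 'b.mp4')
#     'https://example.com/a/b.mp4'
#     >>> urljoin('https://example.com/a/', '/b.mp4')
#     'https://example.com/b.mp4'
#     >>> urljoin('ftp://example.com/', 'b.mp4') is None
#     True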
2580
2581
2582 class HEADRequest(urllib.request.Request):
2583 def get_method(self):
2584 return 'HEAD'
2585
2586
2587 class PUTRequest(urllib.request.Request):
2588 def get_method(self):
2589 return 'PUT'
2590
2591
2592 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2593 if get_attr and v is not None:
2594 v = getattr(v, get_attr, None)
2595 try:
2596 return int(v) * invscale // scale
2597 except (ValueError, TypeError, OverflowError):
2598 return default
2599
2600
2601 def str_or_none(v, default=None):
2602 return default if v is None else str(v)
2603
2604
2605 def str_to_int(int_str):
2606 """ A more relaxed version of int_or_none """
2607 if isinstance(int_str, int):
2608 return int_str
2609 elif isinstance(int_str, str):
2610 int_str = re.sub(r'[,\.\+]', '', int_str)
2611 return int_or_none(int_str)
2612
2613
2614 def float_or_none(v, scale=1, invscale=1, default=None):
2615 if v is None:
2616 return default
2617 try:
2618 return float(v) * invscale / scale
2619 except (ValueError, TypeError):
2620 return default
2621
2622
2623 def bool_or_none(v, default=None):
2624 return v if isinstance(v, bool) else default
2625
2626
2627 def strip_or_none(v, default=None):
2628 return v.strip() if isinstance(v, str) else default
2629
2630
2631 def url_or_none(url):
2632 if not url or not isinstance(url, str):
2633 return None
2634 url = url.strip()
2635 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2636
2637
2638 def request_to_url(req):
2639 if isinstance(req, urllib.request.Request):
2640 return req.get_full_url()
2641 else:
2642 return req
2643
2644
2645 def strftime_or_none(timestamp, date_format, default=None):
2646 datetime_object = None
2647 try:
2648 if isinstance(timestamp, (int, float)): # unix timestamp
2649 # Using naive datetime here can break timestamp() in Windows
2650 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2651 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
2652 elif isinstance(timestamp, str): # assume YYYYMMDD
2653 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2654 date_format = re.sub( # Support %s on windows
2655 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
2656 return datetime_object.strftime(date_format)
2657 except (ValueError, TypeError, AttributeError):
2658 return default
2659
2660
2661 def parse_duration(s):
2662 if not isinstance(s, str):
2663 return None
2664 s = s.strip()
2665 if not s:
2666 return None
2667
2668 days, hours, mins, secs, ms = [None] * 5
2669 m = re.match(r'''(?x)
2670 (?P<before_secs>
2671 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2672 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2673 (?P<ms>[.:][0-9]+)?Z?$
2674 ''', s)
2675 if m:
2676 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2677 else:
2678 m = re.match(
2679 r'''(?ix)(?:P?
2680 (?:
2681 [0-9]+\s*y(?:ears?)?,?\s*
2682 )?
2683 (?:
2684 [0-9]+\s*m(?:onths?)?,?\s*
2685 )?
2686 (?:
2687 [0-9]+\s*w(?:eeks?)?,?\s*
2688 )?
2689 (?:
2690 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2691 )?
2692 T)?
2693 (?:
2694 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2695 )?
2696 (?:
2697 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2698 )?
2699 (?:
2700 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2701 )?Z?$''', s)
2702 if m:
2703 days, hours, mins, secs, ms = m.groups()
2704 else:
2705 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2706 if m:
2707 hours, mins = m.groups()
2708 else:
2709 return None
2710
2711 if ms:
2712 ms = ms.replace(':', '.')
2713 return sum(float(part or 0) * mult for part, mult in (
2714 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
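
# Examples (illustrative):
#     >>> parse_duration('1:02:03')
#     3723.0
#     >>> parse_duration('PT1H30M')
#     5400.0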
2715
2716
2717 def prepend_extension(filename, ext, expected_real_ext=None):
2718 name, real_ext = os.path.splitext(filename)
2719 return (
2720 f'{name}.{ext}{real_ext}'
2721 if not expected_real_ext or real_ext[1:] == expected_real_ext
2722 else f'{filename}.{ext}')
2723
2724
2725 def replace_extension(filename, ext, expected_real_ext=None):
2726 name, real_ext = os.path.splitext(filename)
2727 return '{}.{}'.format(
2728 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2729 ext)
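
# Examples (illustrative):
#     >>> prepend_extension('video.mp4', 'temp')
#     'video.temp.mp4'
#     >>> replace_extension('video.mp4', 'mkv')
#     'video.mkv'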
2730
2731
2732 def check_executable(exe, args=[]):
2733     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2734     args can be a list of arguments that produce a short output (like -version) """
2735 try:
2736 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2737 except OSError:
2738 return False
2739 return exe
2740
2741
2742 def _get_exe_version_output(exe, args):
2743 try:
2744 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2745 # SIGTTOU if yt-dlp is run in the background.
2746 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2747 stdout, _, ret = Popen.run([encodeArgument(exe)] + args, text=True,
2748 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2749 if ret:
2750 return None
2751 except OSError:
2752 return False
2753 return stdout
2754
2755
2756 def detect_exe_version(output, version_re=None, unrecognized='present'):
2757 assert isinstance(output, str)
2758 if version_re is None:
2759 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2760 m = re.search(version_re, output)
2761 if m:
2762 return m.group(1)
2763 else:
2764 return unrecognized
2765
2766
2767 def get_exe_version(exe, args=['--version'],
2768 version_re=None, unrecognized=('present', 'broken')):
2769 """ Returns the version of the specified executable,
2770 or False if the executable is not present """
2771 unrecognized = variadic(unrecognized)
2772 assert len(unrecognized) in (1, 2)
2773 out = _get_exe_version_output(exe, args)
2774 if out is None:
2775 return unrecognized[-1]
2776 return out and detect_exe_version(out, version_re, unrecognized[0])
2777
2778
2779 def frange(start=0, stop=None, step=1):
2780 """Float range"""
2781 if stop is None:
2782 start, stop = 0, start
2783 sign = [-1, 1][step > 0] if step else 0
2784 while sign * start < sign * stop:
2785 yield start
2786 start += step
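
# Examples (illustrative):
#     >>> list(frange(3))
#     [0, 1, 2]
#     >>> list(frange(0, 1, 0.25))
#     [0, 0.25, 0.5, 0.75]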
2787
2788
2789 class LazyList(collections.abc.Sequence):
2790 """Lazy immutable list from an iterable
2791     Note that slices of a LazyList are lists and not LazyLists"""
2792
2793 class IndexError(IndexError):
2794 pass
2795
2796 def __init__(self, iterable, *, reverse=False, _cache=None):
2797 self._iterable = iter(iterable)
2798 self._cache = [] if _cache is None else _cache
2799 self._reversed = reverse
2800
2801 def __iter__(self):
2802 if self._reversed:
2803 # We need to consume the entire iterable to iterate in reverse
2804 yield from self.exhaust()
2805 return
2806 yield from self._cache
2807 for item in self._iterable:
2808 self._cache.append(item)
2809 yield item
2810
2811 def _exhaust(self):
2812 self._cache.extend(self._iterable)
2813 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2814 return self._cache
2815
2816 def exhaust(self):
2817 """Evaluate the entire iterable"""
2818 return self._exhaust()[::-1 if self._reversed else 1]
2819
2820 @staticmethod
2821 def _reverse_index(x):
2822 return None if x is None else ~x
2823
2824 def __getitem__(self, idx):
2825 if isinstance(idx, slice):
2826 if self._reversed:
2827 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2828 start, stop, step = idx.start, idx.stop, idx.step or 1
2829 elif isinstance(idx, int):
2830 if self._reversed:
2831 idx = self._reverse_index(idx)
2832 start, stop, step = idx, idx, 0
2833 else:
2834 raise TypeError('indices must be integers or slices')
2835 if ((start or 0) < 0 or (stop or 0) < 0
2836 or (start is None and step < 0)
2837 or (stop is None and step > 0)):
2838 # We need to consume the entire iterable to be able to slice from the end
2839 # Obviously, never use this with infinite iterables
2840 self._exhaust()
2841 try:
2842 return self._cache[idx]
2843 except IndexError as e:
2844 raise self.IndexError(e) from e
2845 n = max(start or 0, stop or 0) - len(self._cache) + 1
2846 if n > 0:
2847 self._cache.extend(itertools.islice(self._iterable, n))
2848 try:
2849 return self._cache[idx]
2850 except IndexError as e:
2851 raise self.IndexError(e) from e
2852
2853 def __bool__(self):
2854 try:
2855 self[-1] if self._reversed else self[0]
2856 except self.IndexError:
2857 return False
2858 return True
2859
2860 def __len__(self):
2861 self._exhaust()
2862 return len(self._cache)
2863
2864 def __reversed__(self):
2865 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2866
2867 def __copy__(self):
2868 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2869
2870 def __repr__(self):
2871 # repr and str should mimic a list. So we exhaust the iterable
2872 return repr(self.exhaust())
2873
2874 def __str__(self):
2875 return repr(self.exhaust())
2876
2877
2878 class PagedList:
2879
2880 class IndexError(IndexError):
2881 pass
2882
2883 def __len__(self):
2884 # This is only useful for tests
2885 return len(self.getslice())
2886
2887 def __init__(self, pagefunc, pagesize, use_cache=True):
2888 self._pagefunc = pagefunc
2889 self._pagesize = pagesize
2890 self._pagecount = float('inf')
2891 self._use_cache = use_cache
2892 self._cache = {}
2893
2894 def getpage(self, pagenum):
2895 page_results = self._cache.get(pagenum)
2896 if page_results is None:
2897 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2898 if self._use_cache:
2899 self._cache[pagenum] = page_results
2900 return page_results
2901
2902 def getslice(self, start=0, end=None):
2903 return list(self._getslice(start, end))
2904
2905 def _getslice(self, start, end):
2906 raise NotImplementedError('This method must be implemented by subclasses')
2907
2908 def __getitem__(self, idx):
2909 assert self._use_cache, 'Indexing PagedList requires cache'
2910 if not isinstance(idx, int) or idx < 0:
2911 raise TypeError('indices must be non-negative integers')
2912 entries = self.getslice(idx, idx + 1)
2913 if not entries:
2914 raise self.IndexError()
2915 return entries[0]
2916
2917
2918 class OnDemandPagedList(PagedList):
2919     """Download pages until a page with fewer than the maximum number of results"""
2920
2921 def _getslice(self, start, end):
2922 for pagenum in itertools.count(start // self._pagesize):
2923 firstid = pagenum * self._pagesize
2924 nextfirstid = pagenum * self._pagesize + self._pagesize
2925 if start >= nextfirstid:
2926 continue
2927
2928 startv = (
2929 start % self._pagesize
2930 if firstid <= start < nextfirstid
2931 else 0)
2932 endv = (
2933 ((end - 1) % self._pagesize) + 1
2934 if (end is not None and firstid <= end <= nextfirstid)
2935 else None)
2936
2937 try:
2938 page_results = self.getpage(pagenum)
2939 except Exception:
2940 self._pagecount = pagenum - 1
2941 raise
2942 if startv != 0 or endv is not None:
2943 page_results = page_results[startv:endv]
2944 yield from page_results
2945
2946             # A little optimization - if the current page is not "full",
2947             # i.e. does not contain page_size videos, then we can assume that
2948             # this page is the last one - there are no more ids on further
2949             # pages - and there is no need to query again.
2950 if len(page_results) + startv < self._pagesize:
2951 break
2952
2953 # If we got the whole page, but the next page is not interesting,
2954 # break out early as well
2955 if end == nextfirstid:
2956 break
2957
2958
2959 class InAdvancePagedList(PagedList):
2960 """PagedList with total number of pages known in advance"""
2961
2962 def __init__(self, pagefunc, pagecount, pagesize):
2963 PagedList.__init__(self, pagefunc, pagesize, True)
2964 self._pagecount = pagecount
2965
2966 def _getslice(self, start, end):
2967 start_page = start // self._pagesize
2968 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2969 skip_elems = start - start_page * self._pagesize
2970 only_more = None if end is None else end - start
2971 for pagenum in range(start_page, end_page):
2972 page_results = self.getpage(pagenum)
2973 if skip_elems:
2974 page_results = page_results[skip_elems:]
2975 skip_elems = None
2976 if only_more is not None:
2977 if len(page_results) < only_more:
2978 only_more -= len(page_results)
2979 else:
2980 yield from page_results[:only_more]
2981 break
2982 yield from page_results
2983
2984
2985 class PlaylistEntries:
2986 MissingEntry = object()
2987 is_exhausted = False
2988
2989 def __init__(self, ydl, info_dict):
2990 self.ydl = ydl
2991
2992 # _entries must be assigned now since infodict can change during iteration
2993 entries = info_dict.get('entries')
2994 if entries is None:
2995 raise EntryNotInPlaylist('There are no entries')
2996 elif isinstance(entries, list):
2997 self.is_exhausted = True
2998
2999 requested_entries = info_dict.get('requested_entries')
3000 self.is_incomplete = requested_entries is not None
3001 if self.is_incomplete:
3002 assert self.is_exhausted
3003 self._entries = [self.MissingEntry] * max(requested_entries or [0])
3004 for i, entry in zip(requested_entries, entries):
3005 self._entries[i - 1] = entry
3006 elif isinstance(entries, (list, PagedList, LazyList)):
3007 self._entries = entries
3008 else:
3009 self._entries = LazyList(entries)
3010
3011 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
3012 (?P<start>[+-]?\d+)?
3013 (?P<range>[:-]
3014 (?P<end>[+-]?\d+|inf(?:inite)?)?
3015 (?::(?P<step>[+-]?\d+))?
3016 )?''')
3017
3018 @classmethod
3019 def parse_playlist_items(cls, string):
3020 for segment in string.split(','):
3021 if not segment:
3022                 raise ValueError('There are two or more consecutive commas')
3023 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
3024 if not mobj:
3025 raise ValueError(f'{segment!r} is not a valid specification')
3026 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
3027 if int_or_none(step) == 0:
3028 raise ValueError(f'Step in {segment!r} cannot be zero')
3029 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
3030
3031 def get_requested_items(self):
3032 playlist_items = self.ydl.params.get('playlist_items')
3033 playlist_start = self.ydl.params.get('playliststart', 1)
3034 playlist_end = self.ydl.params.get('playlistend')
3035 # For backwards compatibility, interpret -1 as whole list
3036 if playlist_end in (-1, None):
3037 playlist_end = ''
3038 if not playlist_items:
3039 playlist_items = f'{playlist_start}:{playlist_end}'
3040 elif playlist_start != 1 or playlist_end:
3041 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
3042
3043 for index in self.parse_playlist_items(playlist_items):
3044 for i, entry in self[index]:
3045 yield i, entry
3046 if not entry:
3047 continue
3048 try:
3049                     # The item may have just been added to the archive. Don't break due to it
3050 if not self.ydl.params.get('lazy_playlist'):
3051 # TODO: Add auto-generated fields
3052 self.ydl._match_entry(entry, incomplete=True, silent=True)
3053 except (ExistingVideoReached, RejectedVideoReached):
3054 return
3055
3056 def get_full_count(self):
3057 if self.is_exhausted and not self.is_incomplete:
3058 return len(self)
3059 elif isinstance(self._entries, InAdvancePagedList):
3060 if self._entries._pagesize == 1:
3061 return self._entries._pagecount
3062
3063 @functools.cached_property
3064 def _getter(self):
3065 if isinstance(self._entries, list):
3066 def get_entry(i):
3067 try:
3068 entry = self._entries[i]
3069 except IndexError:
3070 entry = self.MissingEntry
3071 if not self.is_incomplete:
3072 raise self.IndexError()
3073 if entry is self.MissingEntry:
3074 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
3075 return entry
3076 else:
3077 def get_entry(i):
3078 try:
3079 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3080 except (LazyList.IndexError, PagedList.IndexError):
3081 raise self.IndexError()
3082 return get_entry
3083
3084 def __getitem__(self, idx):
3085 if isinstance(idx, int):
3086 idx = slice(idx, idx)
3087
3088 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3089 step = 1 if idx.step is None else idx.step
3090 if idx.start is None:
3091 start = 0 if step > 0 else len(self) - 1
3092 else:
3093 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3094
3095 # NB: Do not call len(self) when idx == [:]
3096 if idx.stop is None:
3097 stop = 0 if step < 0 else float('inf')
3098 else:
3099 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3100 stop += [-1, 1][step > 0]
3101
3102 for i in frange(start, stop, step):
3103 if i < 0:
3104 continue
3105 try:
3106 entry = self._getter(i)
3107 except self.IndexError:
3108 self.is_exhausted = True
3109 if step > 0:
3110 break
3111 continue
3112 yield i + 1, entry
3113
3114 def __len__(self):
3115 return len(tuple(self[:]))
3116
3117 class IndexError(IndexError):
3118 pass
3119
3120
3121 def uppercase_escape(s):
3122 unicode_escape = codecs.getdecoder('unicode_escape')
3123 return re.sub(
3124 r'\\U[0-9a-fA-F]{8}',
3125 lambda m: unicode_escape(m.group(0))[0],
3126 s)
3127
3128
3129 def lowercase_escape(s):
3130 unicode_escape = codecs.getdecoder('unicode_escape')
3131 return re.sub(
3132 r'\\u[0-9a-fA-F]{4}',
3133 lambda m: unicode_escape(m.group(0))[0],
3134 s)
3135
3136
3137 def escape_rfc3986(s):
3138 """Escape non-ASCII characters as suggested by RFC 3986"""
3139 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3140
3141
3142 def escape_url(url):
3143 """Escape URL as suggested by RFC 3986"""
3144 url_parsed = urllib.parse.urlparse(url)
3145 return url_parsed._replace(
3146 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3147 path=escape_rfc3986(url_parsed.path),
3148 params=escape_rfc3986(url_parsed.params),
3149 query=escape_rfc3986(url_parsed.query),
3150 fragment=escape_rfc3986(url_parsed.fragment)
3151 ).geturl()
3152
3153
3154 def parse_qs(url, **kwargs):
3155 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
3156
3157
3158 def read_batch_urls(batch_fd):
3159 def fixup(url):
3160 if not isinstance(url, str):
3161 url = url.decode('utf-8', 'replace')
3162 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3163 for bom in BOM_UTF8:
3164 if url.startswith(bom):
3165 url = url[len(bom):]
3166 url = url.lstrip()
3167 if not url or url.startswith(('#', ';', ']')):
3168 return False
3169         # "#" cannot be stripped out, since it is part of the URI
3170         # However, it can safely be stripped out if it follows a whitespace
3171 return re.split(r'\s#', url, 1)[0].rstrip()
3172
3173 with contextlib.closing(batch_fd) as fd:
3174 return [url for url in map(fixup, fd) if url]
3175
3176
3177 def urlencode_postdata(*args, **kargs):
3178 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3179
3180
3181 def update_url(url, *, query_update=None, **kwargs):
3182 """Replace URL components specified by kwargs
3183     @param url str or parsed URL tuple
3184     @param query_update dict of query parameters to merge into the existing query
3185 @returns str
3186 """
3187 if isinstance(url, str):
3188 if not kwargs and not query_update:
3189 return url
3190 else:
3191 url = urllib.parse.urlparse(url)
3192 if query_update:
3193 assert 'query' not in kwargs, 'query_update and query cannot be specified at the same time'
3194 kwargs['query'] = urllib.parse.urlencode({
3195 **urllib.parse.parse_qs(url.query),
3196 **query_update
3197 }, True)
3198 return urllib.parse.urlunparse(url._replace(**kwargs))
3199
3200
3201 def update_url_query(url, query):
3202 return update_url(url, query_update=query)
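
# Example (illustrative):
#     >>> update_url_query('https://example.com/api?a=1', {'b': 2})
#     'https://example.com/api?a=1&b=2'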
3203
3204
3205 def update_Request(req, url=None, data=None, headers=None, query=None):
3206 req_headers = req.headers.copy()
3207 req_headers.update(headers or {})
3208 req_data = data or req.data
3209 req_url = update_url_query(url or req.get_full_url(), query)
3210 req_get_method = req.get_method()
3211 if req_get_method == 'HEAD':
3212 req_type = HEADRequest
3213 elif req_get_method == 'PUT':
3214 req_type = PUTRequest
3215 else:
3216 req_type = urllib.request.Request
3217 new_req = req_type(
3218 req_url, data=req_data, headers=req_headers,
3219 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3220 if hasattr(req, 'timeout'):
3221 new_req.timeout = req.timeout
3222 return new_req
3223
3224
3225 def _multipart_encode_impl(data, boundary):
3226 content_type = 'multipart/form-data; boundary=%s' % boundary
3227
3228 out = b''
3229 for k, v in data.items():
3230 out += b'--' + boundary.encode('ascii') + b'\r\n'
3231 if isinstance(k, str):
3232 k = k.encode()
3233 if isinstance(v, str):
3234 v = v.encode()
3235 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3236 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3237 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3238 if boundary.encode('ascii') in content:
3239 raise ValueError('Boundary overlaps with data')
3240 out += content
3241
3242 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3243
3244 return out, content_type
3245
3246
3247 def multipart_encode(data, boundary=None):
3248 '''
3249 Encode a dict to RFC 7578-compliant form-data
3250
3251 data:
3252 A dict where keys and values can be either Unicode or bytes-like
3253 objects.
3254 boundary:
3255         If specified, a Unicode object to be used as the boundary. Otherwise,
3256         a random boundary is generated.
3257
3258 Reference: https://tools.ietf.org/html/rfc7578
3259 '''
3260 has_specified_boundary = boundary is not None
3261
3262 while True:
3263 if boundary is None:
3264 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3265
3266 try:
3267 out, content_type = _multipart_encode_impl(data, boundary)
3268 break
3269 except ValueError:
3270 if has_specified_boundary:
3271 raise
3272 boundary = None
3273
3274 return out, content_type
3275
3276
3277 def is_iterable_like(x, allowed_types=collections.abc.Iterable, blocked_types=NO_DEFAULT):
3278 if blocked_types is NO_DEFAULT:
3279 blocked_types = (str, bytes, collections.abc.Mapping)
3280 return isinstance(x, allowed_types) and not isinstance(x, blocked_types)
3281
3282
3283 def variadic(x, allowed_types=NO_DEFAULT):
3284 return x if is_iterable_like(x, blocked_types=allowed_types) else (x, )
3285
3286
3287 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3288 for val in map(d.get, variadic(key_or_keys)):
3289 if val is not None and (val or not skip_false_values):
3290 return val
3291 return default
3292
3293
3294 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3295 for f in funcs:
3296 try:
3297 val = f(*args, **kwargs)
3298 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
3299 pass
3300 else:
3301 if expected_type is None or isinstance(val, expected_type):
3302 return val
3303
3304
3305 def try_get(src, getter, expected_type=None):
3306 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
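
# Examples (illustrative):
#     >>> try_get({'a': {'b': 1}}, lambda x: x['a']['b'], int)
#     1
#     >>> try_get({}, lambda x: x['a']['b']) is None
#     True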
3307
3308
3309 def filter_dict(dct, cndn=lambda _, v: v is not None):
3310 return {k: v for k, v in dct.items() if cndn(k, v)}
3311
3312
3313 def merge_dicts(*dicts):
3314 merged = {}
3315 for a_dict in dicts:
3316 for k, v in a_dict.items():
3317 if (v is not None and k not in merged
3318 or isinstance(v, str) and merged[k] == ''):
3319 merged[k] = v
3320 return merged
3321
3322
3323 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3324 return string if isinstance(string, str) else str(string, encoding, errors)
3325
3326
3327 US_RATINGS = {
3328 'G': 0,
3329 'PG': 10,
3330 'PG-13': 13,
3331 'R': 16,
3332 'NC': 18,
3333 }
3334
3335
3336 TV_PARENTAL_GUIDELINES = {
3337 'TV-Y': 0,
3338 'TV-Y7': 7,
3339 'TV-G': 0,
3340 'TV-PG': 0,
3341 'TV-14': 14,
3342 'TV-MA': 17,
3343 }
3344
3345
3346 def parse_age_limit(s):
3347 # isinstance(False, int) is True. So type() must be used instead
3348 if type(s) is int: # noqa: E721
3349 return s if 0 <= s <= 21 else None
3350 elif not isinstance(s, str):
3351 return None
3352 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3353 if m:
3354 return int(m.group('age'))
3355 s = s.upper()
3356 if s in US_RATINGS:
3357 return US_RATINGS[s]
3358 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3359 if m:
3360 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3361 return None
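
# Examples (illustrative):
#     >>> parse_age_limit('PG-13')
#     13
#     >>> parse_age_limit('TV-MA')
#     17
#     >>> parse_age_limit('18+')
#     18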
3362
3363
3364 def strip_jsonp(code):
3365 return re.sub(
3366 r'''(?sx)^
3367 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3368 (?:\s*&&\s*(?P=func_name))?
3369 \s*\(\s*(?P<callback_data>.*)\);?
3370 \s*?(?://[^\n]*)*$''',
3371 r'\g<callback_data>', code)
3372
3373
3374 def js_to_json(code, vars={}, *, strict=False):
3375 # vars is a dict of var, val pairs to substitute
3376 STRING_QUOTES = '\'"`'
3377 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
3378 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3379 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3380 INTEGER_TABLE = (
3381 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3382 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3383 )
3384
3385 def process_escape(match):
3386 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3387 escape = match.group(1) or match.group(2)
3388
3389 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3390 else R'\u00' if escape == 'x'
3391 else '' if escape == '\n'
3392 else escape)
3393
3394 def template_substitute(match):
3395 evaluated = js_to_json(match.group(1), vars, strict=strict)
3396 if evaluated[0] == '"':
3397 return json.loads(evaluated)
3398 return evaluated
3399
3400 def fix_kv(m):
3401 v = m.group(0)
3402 if v in ('true', 'false', 'null'):
3403 return v
3404 elif v in ('undefined', 'void 0'):
3405 return 'null'
3406 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3407 return ''
3408
3409 if v[0] in STRING_QUOTES:
3410 v = re.sub(r'(?s)\${([^}]+)}', template_substitute, v[1:-1]) if v[0] == '`' else v[1:-1]
3411 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v)
3412 return f'"{escaped}"'
3413
3414 for regex, base in INTEGER_TABLE:
3415 im = re.match(regex, v)
3416 if im:
3417 i = int(im.group(1), base)
3418 return f'"{i}":' if v.endswith(':') else str(i)
3419
3420 if v in vars:
3421 try:
3422 if not strict:
3423 json.loads(vars[v])
3424 except json.JSONDecodeError:
3425 return json.dumps(vars[v])
3426 else:
3427 return vars[v]
3428
3429 if not strict:
3430 return f'"{v}"'
3431
3432 raise ValueError(f'Unknown value: {v}')
3433
3434 def create_map(mobj):
3435 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3436
3437 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3438 if not strict:
3439 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3440 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
3441 code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
3442 code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
3443
3444 return re.sub(rf'''(?sx)
3445 {STRING_RE}|
3446 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
3447 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3448 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3449 [0-9]+(?={SKIP_RE}:)|
3450 !+
3451 ''', fix_kv, code)
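
# Example (illustrative):
#     >>> js_to_json("{abc: 'def', n: 0x1F, u: undefined}")
#     '{"abc": "def", "n": 31, "u": null}'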
3452
3453
3454 def qualities(quality_ids):
3455 """ Get a numeric quality value out of a list of possible values """
3456 def q(qid):
3457 try:
3458 return quality_ids.index(qid)
3459 except ValueError:
3460 return -1
3461 return q
3462
3463
3464 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'video', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3465
3466
3467 DEFAULT_OUTTMPL = {
3468 'default': '%(title)s [%(id)s].%(ext)s',
3469 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3470 }
3471 OUTTMPL_TYPES = {
3472 'chapter': None,
3473 'subtitle': None,
3474 'thumbnail': None,
3475 'description': 'description',
3476 'annotation': 'annotations.xml',
3477 'infojson': 'info.json',
3478 'link': None,
3479 'pl_video': None,
3480 'pl_thumbnail': None,
3481 'pl_description': 'description',
3482 'pl_infojson': 'info.json',
3483 }
3484
3485 # As of [1], the format syntax is:
3486 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3487 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3488 STR_FORMAT_RE_TMPL = r'''(?x)
3489 (?<!%)(?P<prefix>(?:%%)*)
3490 %
3491 (?P<has_key>\((?P<key>{0})\))?
3492 (?P<format>
3493 (?P<conversion>[#0\-+ ]+)?
3494 (?P<min_width>\d+)?
3495 (?P<precision>\.\d+)?
3496 (?P<len_mod>[hlL])? # unused in python
3497 {1} # conversion type
3498 )
3499 '''
3500
3501
3502 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
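# Illustrative instantiation (an added sketch; restricting keys to \w+ is an
# assumption made for this example, not something the template mandates):
#   >>> m = re.search(STR_FORMAT_RE_TMPL.format(r'\w+', f'[{STR_FORMAT_TYPES}]'), '%(title)s - %(id)s')
#   >>> m.group('key')
#   'title'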
3503
3504
3505 def limit_length(s, length):
3506 """ Add ellipses to overly long strings """
3507 if s is None:
3508 return None
3509 ELLIPSES = '...'
3510 if len(s) > length:
3511 return s[:length - len(ELLIPSES)] + ELLIPSES
3512 return s
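# Example (added for clarity):
#   >>> limit_length('yt-dlp is a feature-rich downloader', 14)
#   'yt-dlp is a...'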
3513
3514
3515 def version_tuple(v):
3516 return tuple(int(e) for e in re.split(r'[-.]', v))
3517
3518
3519 def is_outdated_version(version, limit, assume_new=True):
3520 if not version:
3521 return not assume_new
3522 try:
3523 return version_tuple(version) < version_tuple(limit)
3524 except ValueError:
3525 return not assume_new
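# Example (added for clarity):
#   >>> is_outdated_version('2023.03.04', '2023.06.22')
#   True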
3526
3527
3528 def ytdl_is_updateable():
3529 """ Returns if yt-dlp can be updated with -U """
3530
3531 from .update import is_non_updateable
3532
3533 return not is_non_updateable()
3534
3535
3536 def args_to_str(args):
3537 # Get a short string representation for a subprocess command
3538 return ' '.join(compat_shlex_quote(a) for a in args)
3539
3540
3541 def error_to_compat_str(err):
3542 return str(err)
3543
3544
3545 def error_to_str(err):
3546 return f'{type(err).__name__}: {err}'
3547
3548
3549 def mimetype2ext(mt, default=NO_DEFAULT):
3550 if not isinstance(mt, str):
3551 if default is not NO_DEFAULT:
3552 return default
3553 return None
3554
3555 MAP = {
3556 # video
3557 '3gpp': '3gp',
3558 'mp2t': 'ts',
3559 'mp4': 'mp4',
3560 'mpeg': 'mpeg',
3561 'mpegurl': 'm3u8',
3562 'quicktime': 'mov',
3563 'webm': 'webm',
3564 'vp9': 'vp9',
3565 'x-flv': 'flv',
3566 'x-m4v': 'm4v',
3567 'x-matroska': 'mkv',
3568 'x-mng': 'mng',
3569 'x-mp4-fragmented': 'mp4',
3570 'x-ms-asf': 'asf',
3571 'x-ms-wmv': 'wmv',
3572 'x-msvideo': 'avi',
3573
3574 # application (streaming playlists)
3575 'dash+xml': 'mpd',
3576 'f4m+xml': 'f4m',
3577 'hds+xml': 'f4m',
3578 'vnd.apple.mpegurl': 'm3u8',
3579 'vnd.ms-sstr+xml': 'ism',
3580 'x-mpegurl': 'm3u8',
3581
3582 # audio
3583 'audio/mp4': 'm4a',
3584 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
3585 # Using .mp3 as it's the most popular one
3586 'audio/mpeg': 'mp3',
3587 'audio/webm': 'webm',
3588 'audio/x-matroska': 'mka',
3589 'audio/x-mpegurl': 'm3u',
3590 'midi': 'mid',
3591 'ogg': 'ogg',
3592 'wav': 'wav',
3593 'wave': 'wav',
3594 'x-aac': 'aac',
3595 'x-flac': 'flac',
3596 'x-m4a': 'm4a',
3597 'x-realaudio': 'ra',
3598 'x-wav': 'wav',
3599
3600 # image
3601 'avif': 'avif',
3602 'bmp': 'bmp',
3603 'gif': 'gif',
3604 'jpeg': 'jpg',
3605 'png': 'png',
3606 'svg+xml': 'svg',
3607 'tiff': 'tif',
3608 'vnd.wap.wbmp': 'wbmp',
3609 'webp': 'webp',
3610 'x-icon': 'ico',
3611 'x-jng': 'jng',
3612 'x-ms-bmp': 'bmp',
3613
3614 # caption
3615 'filmstrip+json': 'fs',
3616 'smptett+xml': 'tt',
3617 'ttaf+xml': 'dfxp',
3618 'ttml+xml': 'ttml',
3619 'x-ms-sami': 'sami',
3620
3621 # misc
3622 'gzip': 'gz',
3623 'json': 'json',
3624 'xml': 'xml',
3625 'zip': 'zip',
3626 }
3627
3628 mimetype = mt.partition(';')[0].strip().lower()
3629 _, _, subtype = mimetype.rpartition('/')
3630
3631 ext = traverse_obj(MAP, mimetype, subtype, subtype.rsplit('+')[-1])
3632 if ext:
3633 return ext
3634 elif default is not NO_DEFAULT:
3635 return default
3636 return subtype.replace('+', '.')
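# Illustrative behaviour (an added sketch, not part of the original source):
#   >>> mimetype2ext('application/vnd.apple.mpegurl; charset=utf-8')
#   'm3u8'
#   >>> mimetype2ext('video/foo+bar')  # unmapped subtypes fall through
#   'foo.bar'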
3637
3638
3639 def ext2mimetype(ext_or_url):
3640 if not ext_or_url:
3641 return None
3642 if '.' not in ext_or_url:
3643 ext_or_url = f'file.{ext_or_url}'
3644 return mimetypes.guess_type(ext_or_url)[0]
3645
3646
3647 def parse_codecs(codecs_str):
3648 # http://tools.ietf.org/html/rfc6381
3649 if not codecs_str:
3650 return {}
3651 split_codecs = list(filter(None, map(
3652 str.strip, codecs_str.strip().strip(',').split(','))))
3653 vcodec, acodec, scodec, hdr = None, None, None, None
3654 for full_codec in split_codecs:
3655 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3656 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3657 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3658 if vcodec:
3659 continue
3660 vcodec = full_codec
3661 if parts[0] in ('dvh1', 'dvhe'):
3662 hdr = 'DV'
3663 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3664 hdr = 'HDR10'
3665 elif parts[:2] == ['vp9', '2']:
3666 hdr = 'HDR10'
3667 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-4',
3668 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3669 acodec = acodec or full_codec
3670 elif parts[0] in ('stpp', 'wvtt'):
3671 scodec = scodec or full_codec
3672 else:
3673 write_string(f'WARNING: Unknown codec {full_codec}\n')
3674 if vcodec or acodec or scodec:
3675 return {
3676 'vcodec': vcodec or 'none',
3677 'acodec': acodec or 'none',
3678 'dynamic_range': hdr,
3679 **({'scodec': scodec} if scodec is not None else {}),
3680 }
3681 elif len(split_codecs) == 2:
3682 return {
3683 'vcodec': split_codecs[0],
3684 'acodec': split_codecs[1],
3685 }
3686 return {}
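# Illustrative behaviour (an added sketch, not part of the original source):
#   >>> parse_codecs('avc1.64001f, mp4a.40.2')
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}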
3687
3688
3689 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3690 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3691
3692 allow_mkv = not preferences or 'mkv' in preferences
3693
3694 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3695 return 'mkv' # TODO: any other format allows this?
3696
3697 # TODO: Not all codecs supported by parse_codecs are handled here
3698 COMPATIBLE_CODECS = {
3699 'mp4': {
3700 'av1', 'hevc', 'avc1', 'mp4a', 'ac-4', # fourcc (m3u8, mpd)
3701 'h264', 'aacl', 'ec-3', # Set in ISM
3702 },
3703 'webm': {
3704 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3705 'vp9x', 'vp8x', # in the webm spec
3706 },
3707 }
3708
3709 sanitize_codec = functools.partial(
3710 try_get, getter=lambda x: x[0].split('.')[0].replace('0', '').lower())
3711 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3712
3713 for ext in preferences or COMPATIBLE_CODECS.keys():
3714 codec_set = COMPATIBLE_CODECS.get(ext, set())
3715 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3716 return ext
3717
3718 COMPATIBLE_EXTS = (
3719 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3720 {'webm', 'weba'},
3721 )
3722 for ext in preferences or vexts:
3723 current_exts = {ext, *vexts, *aexts}
3724 if ext == 'mkv' or current_exts == {ext} or any(
3725 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3726 return ext
3727 return 'mkv' if allow_mkv else preferences[-1]
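# Illustrative behaviour (an added sketch): AV1 video plus Opus audio is not an
# mp4-compatible pairing in the table above, so the lookup falls through to webm:
#   >>> get_compatible_ext(vcodecs=['av01.0.12M.10'], acodecs=['opus'], vexts=['mp4'], aexts=['webm'])
#   'webm'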
3728
3729
3730 def urlhandle_detect_ext(url_handle, default=NO_DEFAULT):
3731 getheader = url_handle.headers.get
3732
3733 cd = getheader('Content-Disposition')
3734 if cd:
3735 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3736 if m:
3737 e = determine_ext(m.group('filename'), default_ext=None)
3738 if e:
3739 return e
3740
3741 meta_ext = getheader('x-amz-meta-name')
3742 if meta_ext:
3743 e = meta_ext.rpartition('.')[2]
3744 if e:
3745 return e
3746
3747 return mimetype2ext(getheader('Content-Type'), default=default)
3748
3749
3750 def encode_data_uri(data, mime_type):
3751 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
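# Example (added for clarity):
#   >>> encode_data_uri(b'hi', 'text/plain')
#   'data:text/plain;base64,aGk='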
3752
3753
3754 def age_restricted(content_limit, age_limit):
3755 """ Returns True iff the content should be blocked """
3756
3757 if age_limit is None: # No limit set
3758 return False
3759 if content_limit is None:
3760 return False # Content available for everyone
3761 return age_limit < content_limit
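# Example (added for clarity): a viewer age limit of 16 blocks 18-rated content
#   >>> age_restricted(content_limit=18, age_limit=16)
#   True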
3762
3763
3764 # List of known byte-order-marks (BOM); UTF-32-LE must come before UTF-16-LE, since the UTF-16-LE BOM is a prefix of the UTF-32-LE one
3765 BOMS = [
3766 (b'\xef\xbb\xbf', 'utf-8'),
3767 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3768 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3769 (b'\xff\xfe', 'utf-16-le'),
3770 (b'\xfe\xff', 'utf-16-be'),
3771 ]
3772
3773
3774 def is_html(first_bytes):
3775 """ Detect whether a file contains HTML by examining its first bytes. """
3776
3777 encoding = 'utf-8'
3778 for bom, enc in BOMS:
3779 while first_bytes.startswith(bom):
3780 encoding, first_bytes = enc, first_bytes[len(bom):]
3781
3782 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
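# Illustrative usage (added sketch; the BOM is stripped before matching):
#   >>> bool(is_html(b'\xef\xbb\xbf  <!DOCTYPE html>'))
#   True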
3783
3784
3785 def determine_protocol(info_dict):
3786 protocol = info_dict.get('protocol')
3787 if protocol is not None:
3788 return protocol
3789
3790 url = sanitize_url(info_dict['url'])
3791 if url.startswith('rtmp'):
3792 return 'rtmp'
3793 elif url.startswith('mms'):
3794 return 'mms'
3795 elif url.startswith('rtsp'):
3796 return 'rtsp'
3797
3798 ext = determine_ext(url)
3799 if ext == 'm3u8':
3800 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3801 elif ext == 'f4m':
3802 return 'f4m'
3803
3804 return urllib.parse.urlparse(url).scheme
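# Illustrative usage (an added sketch, not part of the original source):
#   >>> determine_protocol({'url': 'https://example.com/live.m3u8', 'is_live': True})
#   'm3u8'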
3805
3806
3807 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3808 """ Render a list of rows, each as a list of values.
3809 Text after a \t will be right aligned """
3810 def width(string):
3811 return len(remove_terminal_sequences(string).replace('\t', ''))
3812
3813 def get_max_lens(table):
3814 return [max(width(str(v)) for v in col) for col in zip(*table)]
3815
3816 def filter_using_list(row, filterArray):
3817 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3818
3819 max_lens = get_max_lens(data) if hide_empty else []
3820 header_row = filter_using_list(header_row, max_lens)
3821 data = [filter_using_list(row, max_lens) for row in data]
3822
3823 table = [header_row] + data
3824 max_lens = get_max_lens(table)
3825 extra_gap += 1
3826 if delim:
3827 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3828 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3829 for row in table:
3830 for pos, text in enumerate(map(str, row)):
3831 if '\t' in text:
3832 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3833 else:
3834 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3835 ret = '\n'.join(''.join(row).rstrip() for row in table)
3836 return ret
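# Illustrative output (an added sketch, not part of the original source):
#   >>> print(render_table(['ID', 'NAME'], [['1', 'foo'], ['22', 'bar']]))
#   ID NAME
#   1  foo
#   22 bar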
3837
3838
3839 def _match_one(filter_part, dct, incomplete):
3840 # TODO: Generalize code with YoutubeDL._build_format_filter
3841 STRING_OPERATORS = {
3842 '*=': operator.contains,
3843 '^=': lambda attr, value: attr.startswith(value),
3844 '$=': lambda attr, value: attr.endswith(value),
3845 '~=': lambda attr, value: re.search(value, attr),
3846 }
3847 COMPARISON_OPERATORS = {
3848 **STRING_OPERATORS,
3849 '<=': operator.le, # "<=" must be defined above "<"
3850 '<': operator.lt,
3851 '>=': operator.ge,
3852 '>': operator.gt,
3853 '=': operator.eq,
3854 }
3855
3856 if isinstance(incomplete, bool):
3857 is_incomplete = lambda _: incomplete
3858 else:
3859 is_incomplete = lambda k: k in incomplete
3860
3861 operator_rex = re.compile(r'''(?x)
3862 (?P<key>[a-z_]+)
3863 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3864 (?:
3865 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3866 (?P<strval>.+?)
3867 )
3868 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3869 m = operator_rex.fullmatch(filter_part.strip())
3870 if m:
3871 m = m.groupdict()
3872 unnegated_op = COMPARISON_OPERATORS[m['op']]
3873 if m['negation']:
3874 op = lambda attr, value: not unnegated_op(attr, value)
3875 else:
3876 op = unnegated_op
3877 comparison_value = m['quotedstrval'] or m['strval']  # exactly one of these groups matches; the regex defines no 'intval' group
3878 if m['quote']:
3879 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3880 actual_value = dct.get(m['key'])
3881 numeric_comparison = None
3882 if isinstance(actual_value, (int, float)):
3883 # If the original field is a string and the matching comparison value is
3884 # a number, we should respect the origin of the original field
3885 # and process comparison value as a string (see
3886 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3887 try:
3888 numeric_comparison = int(comparison_value)
3889 except ValueError:
3890 numeric_comparison = parse_filesize(comparison_value)
3891 if numeric_comparison is None:
3892 numeric_comparison = parse_filesize(f'{comparison_value}B')
3893 if numeric_comparison is None:
3894 numeric_comparison = parse_duration(comparison_value)
3895 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3896 raise ValueError('Operator %s only supports string values!' % m['op'])
3897 if actual_value is None:
3898 return is_incomplete(m['key']) or m['none_inclusive']
3899 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3900
3901 UNARY_OPERATORS = {
3902 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3903 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3904 }
3905 operator_rex = re.compile(r'''(?x)
3906 (?P<op>%s)\s*(?P<key>[a-z_]+)
3907 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3908 m = operator_rex.fullmatch(filter_part.strip())
3909 if m:
3910 op = UNARY_OPERATORS[m.group('op')]
3911 actual_value = dct.get(m.group('key'))
3912 if is_incomplete(m.group('key')) and actual_value is None:
3913 return True
3914 return op(actual_value)
3915
3916 raise ValueError('Invalid filter part %r' % filter_part)
3917
3918
3919 def match_str(filter_str, dct, incomplete=False):
3920 """ Filter a dictionary with a simple string syntax.
3921 @returns Whether the filter passes
3922 @param incomplete Set of keys that is expected to be missing from dct.
3923 Can be True/False to indicate all/none of the keys may be missing.
3924 All conditions on incomplete keys pass if the key is missing
3925 """
3926 return all(
3927 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3928 for filter_part in re.split(r'(?<!\\)&', filter_str))
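# Illustrative usage (an added sketch): '&' separates conditions, '?' makes a
# condition pass when the field is missing or None, and '~=' matches a regex:
#   >>> match_str('like_count>?100 & title~=(?i)cats', {'title': 'Funny CATS', 'like_count': None})
#   True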
3929
3930
3931 def match_filter_func(filters, breaking_filters=None):
3932 if not filters and not breaking_filters:
3933 return None
3934 breaking_filters = match_filter_func(breaking_filters) or (lambda _, __: None)
3935 filters = set(variadic(filters or []))
3936
3937 interactive = '-' in filters
3938 if interactive:
3939 filters.remove('-')
3940
3941 def _match_func(info_dict, incomplete=False):
3942 ret = breaking_filters(info_dict, incomplete)
3943 if ret is not None:
3944 raise RejectedVideoReached(ret)
3945
3946 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3947 return NO_DEFAULT if interactive and not incomplete else None
3948 else:
3949 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3950 filter_str = ') | ('.join(map(str.strip, filters))
3951 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3952 return _match_func
3953
3954
3955 class download_range_func:
3956 def __init__(self, chapters, ranges):
3957 self.chapters, self.ranges = chapters, ranges
3958
3959 def __call__(self, info_dict, ydl):
3960 if not self.ranges and not self.chapters:
3961 yield {}
3962
3963 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3964 else 'Cannot match chapters since chapter information is unavailable')
3965 for regex in self.chapters or []:
3966 for i, chapter in enumerate(info_dict.get('chapters') or []):
3967 if re.search(regex, chapter['title']):
3968 warning = None
3969 yield {**chapter, 'index': i}
3970 if self.chapters and warning:
3971 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3972
3973 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3974
3975 def __eq__(self, other):
3976 return (isinstance(other, download_range_func)
3977 and self.chapters == other.chapters and self.ranges == other.ranges)
3978
3979 def __repr__(self):
3980 return f'{__name__}.{type(self).__name__}({self.chapters}, {self.ranges})'
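# Illustrative usage (an added sketch; the ydl argument is unused when only
# plain time ranges are requested):
#   >>> list(download_range_func([], [(10, 20)])({'id': 'test'}, None))
#   [{'start_time': 10, 'end_time': 20}]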
3981
3982
3983 def parse_dfxp_time_expr(time_expr):
3984 if not time_expr:
3985 return
3986
3987 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3988 if mobj:
3989 return float(mobj.group('time_offset'))
3990
3991 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3992 if mobj:
3993 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
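# Examples (added for clarity):
#   >>> parse_dfxp_time_expr('00:01:07.5')
#   67.5
#   >>> parse_dfxp_time_expr('2.5s')
#   2.5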
3994
3995
3996 def srt_subtitles_timecode(seconds):
3997 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3998
3999
4000 def ass_subtitles_timecode(seconds):
4001 time = timetuple_from_msec(seconds * 1000)
4002 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
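# Examples (added for clarity):
#   >>> srt_subtitles_timecode(3661.5)
#   '01:01:01,500'
#   >>> ass_subtitles_timecode(3661.5)
#   '1:01:01.50'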
4003
4004
4005 def dfxp2srt(dfxp_data):
4006 '''
4007 @param dfxp_data A bytes-like object containing DFXP data
4008 @returns A unicode object containing converted SRT data
4009 '''
4010 LEGACY_NAMESPACES = (
4011 (b'http://www.w3.org/ns/ttml', [
4012 b'http://www.w3.org/2004/11/ttaf1',
4013 b'http://www.w3.org/2006/04/ttaf1',
4014 b'http://www.w3.org/2006/10/ttaf1',
4015 ]),
4016 (b'http://www.w3.org/ns/ttml#styling', [
4017 b'http://www.w3.org/ns/ttml#style',
4018 ]),
4019 )
4020
4021 SUPPORTED_STYLING = [
4022 'color',
4023 'fontFamily',
4024 'fontSize',
4025 'fontStyle',
4026 'fontWeight',
4027 'textDecoration'
4028 ]
4029
4030 _x = functools.partial(xpath_with_ns, ns_map={
4031 'xml': 'http://www.w3.org/XML/1998/namespace',
4032 'ttml': 'http://www.w3.org/ns/ttml',
4033 'tts': 'http://www.w3.org/ns/ttml#styling',
4034 })
4035
4036 styles = {}
4037 default_style = {}
4038
4039 class TTMLPElementParser:
4040 _out = ''
4041 _unclosed_elements = []  # NB: class-level mutable lists, shared between instances;
4042 _applied_styles = []  # safe here since each parse uses a fresh parser with balanced start/end calls
4043
4044 def start(self, tag, attrib):
4045 if tag in (_x('ttml:br'), 'br'):
4046 self._out += '\n'
4047 else:
4048 unclosed_elements = []
4049 style = {}
4050 element_style_id = attrib.get('style')
4051 if default_style:
4052 style.update(default_style)
4053 if element_style_id:
4054 style.update(styles.get(element_style_id, {}))
4055 for prop in SUPPORTED_STYLING:
4056 prop_val = attrib.get(_x('tts:' + prop))
4057 if prop_val:
4058 style[prop] = prop_val
4059 if style:
4060 font = ''
4061 for k, v in sorted(style.items()):
4062 if self._applied_styles and self._applied_styles[-1].get(k) == v:
4063 continue
4064 if k == 'color':
4065 font += ' color="%s"' % v
4066 elif k == 'fontSize':
4067 font += ' size="%s"' % v
4068 elif k == 'fontFamily':
4069 font += ' face="%s"' % v
4070 elif k == 'fontWeight' and v == 'bold':
4071 self._out += '<b>'
4072 unclosed_elements.append('b')
4073 elif k == 'fontStyle' and v == 'italic':
4074 self._out += '<i>'
4075 unclosed_elements.append('i')
4076 elif k == 'textDecoration' and v == 'underline':
4077 self._out += '<u>'
4078 unclosed_elements.append('u')
4079 if font:
4080 self._out += '<font' + font + '>'
4081 unclosed_elements.append('font')
4082 applied_style = {}
4083 if self._applied_styles:
4084 applied_style.update(self._applied_styles[-1])
4085 applied_style.update(style)
4086 self._applied_styles.append(applied_style)
4087 self._unclosed_elements.append(unclosed_elements)
4088
4089 def end(self, tag):
4090 if tag not in (_x('ttml:br'), 'br'):
4091 unclosed_elements = self._unclosed_elements.pop()
4092 for element in reversed(unclosed_elements):
4093 self._out += '</%s>' % element
4094 if unclosed_elements and self._applied_styles:
4095 self._applied_styles.pop()
4096
4097 def data(self, data):
4098 self._out += data
4099
4100 def close(self):
4101 return self._out.strip()
4102
4103 # Fix UTF-8 encoded file wrongly marked as UTF-16. See https://github.com/yt-dlp/yt-dlp/issues/6543#issuecomment-1477169870
4104 # This will not trigger false positives since only UTF-8 text is being replaced
4105 dfxp_data = dfxp_data.replace(b'encoding=\'UTF-16\'', b'encoding=\'UTF-8\'')
4106
4107 def parse_node(node):
4108 target = TTMLPElementParser()
4109 parser = xml.etree.ElementTree.XMLParser(target=target)
4110 parser.feed(xml.etree.ElementTree.tostring(node))
4111 return parser.close()
4112
4113 for k, v in LEGACY_NAMESPACES:
4114 for ns in v:
4115 dfxp_data = dfxp_data.replace(ns, k)
4116
4117 dfxp = compat_etree_fromstring(dfxp_data)
4118 out = []
4119 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4120
4121 if not paras:
4122 raise ValueError('Invalid dfxp/TTML subtitle')
4123
4124 repeat = False
4125 while True:
4126 for style in dfxp.findall(_x('.//ttml:style')):
4127 style_id = style.get('id') or style.get(_x('xml:id'))
4128 if not style_id:
4129 continue
4130 parent_style_id = style.get('style')
4131 if parent_style_id:
4132 if parent_style_id not in styles:
4133 repeat = True
4134 continue
4135 styles[style_id] = styles[parent_style_id].copy()
4136 for prop in SUPPORTED_STYLING:
4137 prop_val = style.get(_x('tts:' + prop))
4138 if prop_val:
4139 styles.setdefault(style_id, {})[prop] = prop_val
4140 if repeat:
4141 repeat = False
4142 else:
4143 break
4144
4145 for p in ('body', 'div'):
4146 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4147 if ele is None:
4148 continue
4149 style = styles.get(ele.get('style'))
4150 if not style:
4151 continue
4152 default_style.update(style)
4153
4154 for para, index in zip(paras, itertools.count(1)):
4155 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4156 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4157 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4158 if begin_time is None:
4159 continue
4160 if not end_time:
4161 if not dur:
4162 continue
4163 end_time = begin_time + dur
4164 out.append('%d\n%s --> %s\n%s\n\n' % (
4165 index,
4166 srt_subtitles_timecode(begin_time),
4167 srt_subtitles_timecode(end_time),
4168 parse_node(para)))
4169
4170 return ''.join(out)
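# A minimal end-to-end sketch (added for clarity, not part of the original source):
#   >>> dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#   ...          b'<p begin="0s" end="1s">Hi</p></div></body></tt>')
#   '1\n00:00:00,000 --> 00:00:01,000\nHi\n\n'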
4171
4172
4173 def cli_option(params, command_option, param, separator=None):
4174 param = params.get(param)
4175 return ([] if param is None
4176 else [command_option, str(param)] if separator is None
4177 else [f'{command_option}{separator}{param}'])
4178
4179
4180 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4181 param = params.get(param)
4182 assert param in (True, False, None)
4183 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
4184
4185
4186 def cli_valueless_option(params, command_option, param, expected_value=True):
4187 return [command_option] if params.get(param) == expected_value else []
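# Illustrative usage of the cli_* helpers (an added sketch; the params dict and
# option names below are hypothetical):
#   >>> cli_option({'proxy': 'socks5://127.0.0.1'}, '--proxy', 'proxy')
#   ['--proxy', 'socks5://127.0.0.1']
#   >>> cli_bool_option({'check_cert': True}, '--check-certificate', 'check_cert')
#   ['--check-certificate', 'true']
#   >>> cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#   ['--quiet']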
4188
4189
4190 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4191 if isinstance(argdict, (list, tuple)): # for backward compatibility
4192 if use_compat:
4193 return argdict
4194 else:
4195 argdict = None
4196 if argdict is None:
4197 return default
4198 assert isinstance(argdict, dict)
4199
4200 assert isinstance(keys, (list, tuple))
4201 for key_list in keys:
4202 arg_list = list(filter(
4203 lambda x: x is not None,
4204 [argdict.get(key.lower()) for key in variadic(key_list)]))
4205 if arg_list:
4206 return [arg for args in arg_list for arg in args]
4207 return default
4208
4209
4210 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4211 main_key, exe = main_key.lower(), exe.lower()
4212 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4213 keys = [f'{root_key}{k}' for k in (keys or [''])]
4214 if root_key in keys:
4215 if main_key != exe:
4216 keys.append((main_key, exe))
4217 keys.append('default')
4218 else:
4219 use_compat = False
4220 return cli_configuration_args(argdict, keys, default, use_compat)
4221
4222
4223 class ISO639Utils:
4224 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4225 _lang_map = {
4226 'aa': 'aar',
4227 'ab': 'abk',
4228 'ae': 'ave',
4229 'af': 'afr',
4230 'ak': 'aka',
4231 'am': 'amh',
4232 'an': 'arg',
4233 'ar': 'ara',
4234 'as': 'asm',
4235 'av': 'ava',
4236 'ay': 'aym',
4237 'az': 'aze',
4238 'ba': 'bak',
4239 'be': 'bel',
4240 'bg': 'bul',
4241 'bh': 'bih',
4242 'bi': 'bis',
4243 'bm': 'bam',
4244 'bn': 'ben',
4245 'bo': 'bod',
4246 'br': 'bre',
4247 'bs': 'bos',
4248 'ca': 'cat',
4249 'ce': 'che',
4250 'ch': 'cha',
4251 'co': 'cos',
4252 'cr': 'cre',
4253 'cs': 'ces',
4254 'cu': 'chu',
4255 'cv': 'chv',
4256 'cy': 'cym',
4257 'da': 'dan',
4258 'de': 'deu',
4259 'dv': 'div',
4260 'dz': 'dzo',
4261 'ee': 'ewe',
4262 'el': 'ell',
4263 'en': 'eng',
4264 'eo': 'epo',
4265 'es': 'spa',
4266 'et': 'est',
4267 'eu': 'eus',
4268 'fa': 'fas',
4269 'ff': 'ful',
4270 'fi': 'fin',
4271 'fj': 'fij',
4272 'fo': 'fao',
4273 'fr': 'fra',
4274 'fy': 'fry',
4275 'ga': 'gle',
4276 'gd': 'gla',
4277 'gl': 'glg',
4278 'gn': 'grn',
4279 'gu': 'guj',
4280 'gv': 'glv',
4281 'ha': 'hau',
4282 'he': 'heb',
4283 'iw': 'heb', # Replaced by he in 1989 revision
4284 'hi': 'hin',
4285 'ho': 'hmo',
4286 'hr': 'hrv',
4287 'ht': 'hat',
4288 'hu': 'hun',
4289 'hy': 'hye',
4290 'hz': 'her',
4291 'ia': 'ina',
4292 'id': 'ind',
4293 'in': 'ind', # Replaced by id in 1989 revision
4294 'ie': 'ile',
4295 'ig': 'ibo',
4296 'ii': 'iii',
4297 'ik': 'ipk',
4298 'io': 'ido',
4299 'is': 'isl',
4300 'it': 'ita',
4301 'iu': 'iku',
4302 'ja': 'jpn',
4303 'jv': 'jav',
4304 'ka': 'kat',
4305 'kg': 'kon',
4306 'ki': 'kik',
4307 'kj': 'kua',
4308 'kk': 'kaz',
4309 'kl': 'kal',
4310 'km': 'khm',
4311 'kn': 'kan',
4312 'ko': 'kor',
4313 'kr': 'kau',
4314 'ks': 'kas',
4315 'ku': 'kur',
4316 'kv': 'kom',
4317 'kw': 'cor',
4318 'ky': 'kir',
4319 'la': 'lat',
4320 'lb': 'ltz',
4321 'lg': 'lug',
4322 'li': 'lim',
4323 'ln': 'lin',
4324 'lo': 'lao',
4325 'lt': 'lit',
4326 'lu': 'lub',
4327 'lv': 'lav',
4328 'mg': 'mlg',
4329 'mh': 'mah',
4330 'mi': 'mri',
4331 'mk': 'mkd',
4332 'ml': 'mal',
4333 'mn': 'mon',
4334 'mr': 'mar',
4335 'ms': 'msa',
4336 'mt': 'mlt',
4337 'my': 'mya',
4338 'na': 'nau',
4339 'nb': 'nob',
4340 'nd': 'nde',
4341 'ne': 'nep',
4342 'ng': 'ndo',
4343 'nl': 'nld',
4344 'nn': 'nno',
4345 'no': 'nor',
4346 'nr': 'nbl',
4347 'nv': 'nav',
4348 'ny': 'nya',
4349 'oc': 'oci',
4350 'oj': 'oji',
4351 'om': 'orm',
4352 'or': 'ori',
4353 'os': 'oss',
4354 'pa': 'pan',
4355 'pi': 'pli',
4356 'pl': 'pol',
4357 'ps': 'pus',
4358 'pt': 'por',
4359 'qu': 'que',
4360 'rm': 'roh',
4361 'rn': 'run',
4362 'ro': 'ron',
4363 'ru': 'rus',
4364 'rw': 'kin',
4365 'sa': 'san',
4366 'sc': 'srd',
4367 'sd': 'snd',
4368 'se': 'sme',
4369 'sg': 'sag',
4370 'si': 'sin',
4371 'sk': 'slk',
4372 'sl': 'slv',
4373 'sm': 'smo',
4374 'sn': 'sna',
4375 'so': 'som',
4376 'sq': 'sqi',
4377 'sr': 'srp',
4378 'ss': 'ssw',
4379 'st': 'sot',
4380 'su': 'sun',
4381 'sv': 'swe',
4382 'sw': 'swa',
4383 'ta': 'tam',
4384 'te': 'tel',
4385 'tg': 'tgk',
4386 'th': 'tha',
4387 'ti': 'tir',
4388 'tk': 'tuk',
4389 'tl': 'tgl',
4390 'tn': 'tsn',
4391 'to': 'ton',
4392 'tr': 'tur',
4393 'ts': 'tso',
4394 'tt': 'tat',
4395 'tw': 'twi',
4396 'ty': 'tah',
4397 'ug': 'uig',
4398 'uk': 'ukr',
4399 'ur': 'urd',
4400 'uz': 'uzb',
4401 've': 'ven',
4402 'vi': 'vie',
4403 'vo': 'vol',
4404 'wa': 'wln',
4405 'wo': 'wol',
4406 'xh': 'xho',
4407 'yi': 'yid',
4408 'ji': 'yid', # Replaced by yi in 1989 revision
4409 'yo': 'yor',
4410 'za': 'zha',
4411 'zh': 'zho',
4412 'zu': 'zul',
4413 }
4414
4415 @classmethod
4416 def short2long(cls, code):
4417 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4418 return cls._lang_map.get(code[:2])
4419
4420 @classmethod
4421 def long2short(cls, code):
4422 """Convert language code from ISO 639-2/T to ISO 639-1"""
4423 for short_name, long_name in cls._lang_map.items():
4424 if long_name == code:
4425 return short_name
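# Examples (added for clarity):
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'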
4426
4427
4428 class ISO3166Utils:
4429 # From http://data.okfn.org/data/core/country-list
4430 _country_map = {
4431 'AF': 'Afghanistan',
4432 'AX': 'Åland Islands',
4433 'AL': 'Albania',
4434 'DZ': 'Algeria',
4435 'AS': 'American Samoa',
4436 'AD': 'Andorra',
4437 'AO': 'Angola',
4438 'AI': 'Anguilla',
4439 'AQ': 'Antarctica',
4440 'AG': 'Antigua and Barbuda',
4441 'AR': 'Argentina',
4442 'AM': 'Armenia',
4443 'AW': 'Aruba',
4444 'AU': 'Australia',
4445 'AT': 'Austria',
4446 'AZ': 'Azerbaijan',
4447 'BS': 'Bahamas',
4448 'BH': 'Bahrain',
4449 'BD': 'Bangladesh',
4450 'BB': 'Barbados',
4451 'BY': 'Belarus',
4452 'BE': 'Belgium',
4453 'BZ': 'Belize',
4454 'BJ': 'Benin',
4455 'BM': 'Bermuda',
4456 'BT': 'Bhutan',
4457 'BO': 'Bolivia, Plurinational State of',
4458 'BQ': 'Bonaire, Sint Eustatius and Saba',
4459 'BA': 'Bosnia and Herzegovina',
4460 'BW': 'Botswana',
4461 'BV': 'Bouvet Island',
4462 'BR': 'Brazil',
4463 'IO': 'British Indian Ocean Territory',
4464 'BN': 'Brunei Darussalam',
4465 'BG': 'Bulgaria',
4466 'BF': 'Burkina Faso',
4467 'BI': 'Burundi',
4468 'KH': 'Cambodia',
4469 'CM': 'Cameroon',
4470 'CA': 'Canada',
4471 'CV': 'Cape Verde',
4472 'KY': 'Cayman Islands',
4473 'CF': 'Central African Republic',
4474 'TD': 'Chad',
4475 'CL': 'Chile',
4476 'CN': 'China',
4477 'CX': 'Christmas Island',
4478 'CC': 'Cocos (Keeling) Islands',
4479 'CO': 'Colombia',
4480 'KM': 'Comoros',
4481 'CG': 'Congo',
4482 'CD': 'Congo, the Democratic Republic of the',
4483 'CK': 'Cook Islands',
4484 'CR': 'Costa Rica',
4485 'CI': 'Côte d\'Ivoire',
4486 'HR': 'Croatia',
4487 'CU': 'Cuba',
4488 'CW': 'Curaçao',
4489 'CY': 'Cyprus',
4490 'CZ': 'Czech Republic',
4491 'DK': 'Denmark',
4492 'DJ': 'Djibouti',
4493 'DM': 'Dominica',
4494 'DO': 'Dominican Republic',
4495 'EC': 'Ecuador',
4496 'EG': 'Egypt',
4497 'SV': 'El Salvador',
4498 'GQ': 'Equatorial Guinea',
4499 'ER': 'Eritrea',
4500 'EE': 'Estonia',
4501 'ET': 'Ethiopia',
4502 'FK': 'Falkland Islands (Malvinas)',
4503 'FO': 'Faroe Islands',
4504 'FJ': 'Fiji',
4505 'FI': 'Finland',
4506 'FR': 'France',
4507 'GF': 'French Guiana',
4508 'PF': 'French Polynesia',
4509 'TF': 'French Southern Territories',
4510 'GA': 'Gabon',
4511 'GM': 'Gambia',
4512 'GE': 'Georgia',
4513 'DE': 'Germany',
4514 'GH': 'Ghana',
4515 'GI': 'Gibraltar',
4516 'GR': 'Greece',
4517 'GL': 'Greenland',
4518 'GD': 'Grenada',
4519 'GP': 'Guadeloupe',
4520 'GU': 'Guam',
4521 'GT': 'Guatemala',
4522 'GG': 'Guernsey',
4523 'GN': 'Guinea',
4524 'GW': 'Guinea-Bissau',
4525 'GY': 'Guyana',
4526 'HT': 'Haiti',
4527 'HM': 'Heard Island and McDonald Islands',
4528 'VA': 'Holy See (Vatican City State)',
4529 'HN': 'Honduras',
4530 'HK': 'Hong Kong',
4531 'HU': 'Hungary',
4532 'IS': 'Iceland',
4533 'IN': 'India',
4534 'ID': 'Indonesia',
4535 'IR': 'Iran, Islamic Republic of',
4536 'IQ': 'Iraq',
4537 'IE': 'Ireland',
4538 'IM': 'Isle of Man',
4539 'IL': 'Israel',
4540 'IT': 'Italy',
4541 'JM': 'Jamaica',
4542 'JP': 'Japan',
4543 'JE': 'Jersey',
4544 'JO': 'Jordan',
4545 'KZ': 'Kazakhstan',
4546 'KE': 'Kenya',
4547 'KI': 'Kiribati',
4548 'KP': 'Korea, Democratic People\'s Republic of',
4549 'KR': 'Korea, Republic of',
4550 'KW': 'Kuwait',
4551 'KG': 'Kyrgyzstan',
4552 'LA': 'Lao People\'s Democratic Republic',
4553 'LV': 'Latvia',
4554 'LB': 'Lebanon',
4555 'LS': 'Lesotho',
4556 'LR': 'Liberia',
4557 'LY': 'Libya',
4558 'LI': 'Liechtenstein',
4559 'LT': 'Lithuania',
4560 'LU': 'Luxembourg',
4561 'MO': 'Macao',
4562 'MK': 'Macedonia, the Former Yugoslav Republic of',
4563 'MG': 'Madagascar',
4564 'MW': 'Malawi',
4565 'MY': 'Malaysia',
4566 'MV': 'Maldives',
4567 'ML': 'Mali',
4568 'MT': 'Malta',
4569 'MH': 'Marshall Islands',
4570 'MQ': 'Martinique',
4571 'MR': 'Mauritania',
4572 'MU': 'Mauritius',
4573 'YT': 'Mayotte',
4574 'MX': 'Mexico',
4575 'FM': 'Micronesia, Federated States of',
4576 'MD': 'Moldova, Republic of',
4577 'MC': 'Monaco',
4578 'MN': 'Mongolia',
4579 'ME': 'Montenegro',
4580 'MS': 'Montserrat',
4581 'MA': 'Morocco',
4582 'MZ': 'Mozambique',
4583 'MM': 'Myanmar',
4584 'NA': 'Namibia',
4585 'NR': 'Nauru',
4586 'NP': 'Nepal',
4587 'NL': 'Netherlands',
4588 'NC': 'New Caledonia',
4589 'NZ': 'New Zealand',
4590 'NI': 'Nicaragua',
4591 'NE': 'Niger',
4592 'NG': 'Nigeria',
4593 'NU': 'Niue',
4594 'NF': 'Norfolk Island',
4595 'MP': 'Northern Mariana Islands',
4596 'NO': 'Norway',
4597 'OM': 'Oman',
4598 'PK': 'Pakistan',
4599 'PW': 'Palau',
4600 'PS': 'Palestine, State of',
4601 'PA': 'Panama',
4602 'PG': 'Papua New Guinea',
4603 'PY': 'Paraguay',
4604 'PE': 'Peru',
4605 'PH': 'Philippines',
4606 'PN': 'Pitcairn',
4607 'PL': 'Poland',
4608 'PT': 'Portugal',
4609 'PR': 'Puerto Rico',
4610 'QA': 'Qatar',
4611 'RE': 'Réunion',
4612 'RO': 'Romania',
4613 'RU': 'Russian Federation',
4614 'RW': 'Rwanda',
4615 'BL': 'Saint Barthélemy',
4616 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4617 'KN': 'Saint Kitts and Nevis',
4618 'LC': 'Saint Lucia',
4619 'MF': 'Saint Martin (French part)',
4620 'PM': 'Saint Pierre and Miquelon',
4621 'VC': 'Saint Vincent and the Grenadines',
4622 'WS': 'Samoa',
4623 'SM': 'San Marino',
4624 'ST': 'Sao Tome and Principe',
4625 'SA': 'Saudi Arabia',
4626 'SN': 'Senegal',
4627 'RS': 'Serbia',
4628 'SC': 'Seychelles',
4629 'SL': 'Sierra Leone',
4630 'SG': 'Singapore',
4631 'SX': 'Sint Maarten (Dutch part)',
4632 'SK': 'Slovakia',
4633 'SI': 'Slovenia',
4634 'SB': 'Solomon Islands',
4635 'SO': 'Somalia',
4636 'ZA': 'South Africa',
4637 'GS': 'South Georgia and the South Sandwich Islands',
4638 'SS': 'South Sudan',
4639 'ES': 'Spain',
4640 'LK': 'Sri Lanka',
4641 'SD': 'Sudan',
4642 'SR': 'Suriname',
4643 'SJ': 'Svalbard and Jan Mayen',
4644 'SZ': 'Swaziland',
4645 'SE': 'Sweden',
4646 'CH': 'Switzerland',
4647 'SY': 'Syrian Arab Republic',
4648 'TW': 'Taiwan, Province of China',
4649 'TJ': 'Tajikistan',
4650 'TZ': 'Tanzania, United Republic of',
4651 'TH': 'Thailand',
4652 'TL': 'Timor-Leste',
4653 'TG': 'Togo',
4654 'TK': 'Tokelau',
4655 'TO': 'Tonga',
4656 'TT': 'Trinidad and Tobago',
4657 'TN': 'Tunisia',
4658 'TR': 'Turkey',
4659 'TM': 'Turkmenistan',
4660 'TC': 'Turks and Caicos Islands',
4661 'TV': 'Tuvalu',
4662 'UG': 'Uganda',
4663 'UA': 'Ukraine',
4664 'AE': 'United Arab Emirates',
4665 'GB': 'United Kingdom',
4666 'US': 'United States',
4667 'UM': 'United States Minor Outlying Islands',
4668 'UY': 'Uruguay',
4669 'UZ': 'Uzbekistan',
4670 'VU': 'Vanuatu',
4671 'VE': 'Venezuela, Bolivarian Republic of',
4672 'VN': 'Viet Nam',
4673 'VG': 'Virgin Islands, British',
4674 'VI': 'Virgin Islands, U.S.',
4675 'WF': 'Wallis and Futuna',
4676 'EH': 'Western Sahara',
4677 'YE': 'Yemen',
4678 'ZM': 'Zambia',
4679 'ZW': 'Zimbabwe',
4680 # Not ISO 3166 codes, but used for IP blocks
4681 'AP': 'Asia/Pacific Region',
4682 'EU': 'Europe',
4683 }
4684
4685 @classmethod
4686 def short2full(cls, code):
4687 """Convert an ISO 3166-2 country code to the corresponding full name"""
4688 return cls._country_map.get(code.upper())
4689
4690
4691 class GeoUtils:
4692 # Major IPv4 address blocks per country
4693 _country_ip_map = {
4694 'AD': '46.172.224.0/19',
4695 'AE': '94.200.0.0/13',
4696 'AF': '149.54.0.0/17',
4697 'AG': '209.59.64.0/18',
4698 'AI': '204.14.248.0/21',
4699 'AL': '46.99.0.0/16',
4700 'AM': '46.70.0.0/15',
4701 'AO': '105.168.0.0/13',
4702 'AP': '182.50.184.0/21',
4703 'AQ': '23.154.160.0/24',
4704 'AR': '181.0.0.0/12',
4705 'AS': '202.70.112.0/20',
4706 'AT': '77.116.0.0/14',
4707 'AU': '1.128.0.0/11',
4708 'AW': '181.41.0.0/18',
4709 'AX': '185.217.4.0/22',
4710 'AZ': '5.197.0.0/16',
4711 'BA': '31.176.128.0/17',
4712 'BB': '65.48.128.0/17',
4713 'BD': '114.130.0.0/16',
4714 'BE': '57.0.0.0/8',
4715 'BF': '102.178.0.0/15',
4716 'BG': '95.42.0.0/15',
4717 'BH': '37.131.0.0/17',
4718 'BI': '154.117.192.0/18',
4719 'BJ': '137.255.0.0/16',
4720 'BL': '185.212.72.0/23',
4721 'BM': '196.12.64.0/18',
4722 'BN': '156.31.0.0/16',
4723 'BO': '161.56.0.0/16',
4724 'BQ': '161.0.80.0/20',
4725 'BR': '191.128.0.0/12',
4726 'BS': '24.51.64.0/18',
4727 'BT': '119.2.96.0/19',
4728 'BW': '168.167.0.0/16',
4729 'BY': '178.120.0.0/13',
4730 'BZ': '179.42.192.0/18',
4731 'CA': '99.224.0.0/11',
4732 'CD': '41.243.0.0/16',
4733 'CF': '197.242.176.0/21',
4734 'CG': '160.113.0.0/16',
4735 'CH': '85.0.0.0/13',
4736 'CI': '102.136.0.0/14',
4737 'CK': '202.65.32.0/19',
4738 'CL': '152.172.0.0/14',
4739 'CM': '102.244.0.0/14',
4740 'CN': '36.128.0.0/10',
4741 'CO': '181.240.0.0/12',
4742 'CR': '201.192.0.0/12',
4743 'CU': '152.206.0.0/15',
4744 'CV': '165.90.96.0/19',
4745 'CW': '190.88.128.0/17',
4746 'CY': '31.153.0.0/16',
4747 'CZ': '88.100.0.0/14',
4748 'DE': '53.0.0.0/8',
4749 'DJ': '197.241.0.0/17',
4750 'DK': '87.48.0.0/12',
4751 'DM': '192.243.48.0/20',
4752 'DO': '152.166.0.0/15',
4753 'DZ': '41.96.0.0/12',
4754 'EC': '186.68.0.0/15',
4755 'EE': '90.190.0.0/15',
4756 'EG': '156.160.0.0/11',
4757 'ER': '196.200.96.0/20',
4758 'ES': '88.0.0.0/11',
4759 'ET': '196.188.0.0/14',
4760 'EU': '2.16.0.0/13',
4761 'FI': '91.152.0.0/13',
4762 'FJ': '144.120.0.0/16',
4763 'FK': '80.73.208.0/21',
4764 'FM': '119.252.112.0/20',
4765 'FO': '88.85.32.0/19',
4766 'FR': '90.0.0.0/9',
4767 'GA': '41.158.0.0/15',
4768 'GB': '25.0.0.0/8',
4769 'GD': '74.122.88.0/21',
4770 'GE': '31.146.0.0/16',
4771 'GF': '161.22.64.0/18',
4772 'GG': '62.68.160.0/19',
4773 'GH': '154.160.0.0/12',
4774 'GI': '95.164.0.0/16',
4775 'GL': '88.83.0.0/19',
4776 'GM': '160.182.0.0/15',
4777 'GN': '197.149.192.0/18',
4778 'GP': '104.250.0.0/19',
4779 'GQ': '105.235.224.0/20',
4780 'GR': '94.64.0.0/13',
4781 'GT': '168.234.0.0/16',
4782 'GU': '168.123.0.0/16',
4783 'GW': '197.214.80.0/20',
4784 'GY': '181.41.64.0/18',
4785 'HK': '113.252.0.0/14',
4786 'HN': '181.210.0.0/16',
4787 'HR': '93.136.0.0/13',
4788 'HT': '148.102.128.0/17',
4789 'HU': '84.0.0.0/14',
4790 'ID': '39.192.0.0/10',
4791 'IE': '87.32.0.0/12',
4792 'IL': '79.176.0.0/13',
4793 'IM': '5.62.80.0/20',
4794 'IN': '117.192.0.0/10',
4795 'IO': '203.83.48.0/21',
4796 'IQ': '37.236.0.0/14',
4797 'IR': '2.176.0.0/12',
4798 'IS': '82.221.0.0/16',
4799 'IT': '79.0.0.0/10',
4800 'JE': '87.244.64.0/18',
4801 'JM': '72.27.0.0/17',
4802 'JO': '176.29.0.0/16',
4803 'JP': '133.0.0.0/8',
4804 'KE': '105.48.0.0/12',
4805 'KG': '158.181.128.0/17',
4806 'KH': '36.37.128.0/17',
4807 'KI': '103.25.140.0/22',
4808 'KM': '197.255.224.0/20',
4809 'KN': '198.167.192.0/19',
4810 'KP': '175.45.176.0/22',
4811 'KR': '175.192.0.0/10',
4812 'KW': '37.36.0.0/14',
4813 'KY': '64.96.0.0/15',
4814 'KZ': '2.72.0.0/13',
4815 'LA': '115.84.64.0/18',
4816 'LB': '178.135.0.0/16',
4817 'LC': '24.92.144.0/20',
4818 'LI': '82.117.0.0/19',
4819 'LK': '112.134.0.0/15',
4820 'LR': '102.183.0.0/16',
4821 'LS': '129.232.0.0/17',
4822 'LT': '78.56.0.0/13',
4823 'LU': '188.42.0.0/16',
4824 'LV': '46.109.0.0/16',
4825 'LY': '41.252.0.0/14',
4826 'MA': '105.128.0.0/11',
4827 'MC': '88.209.64.0/18',
4828 'MD': '37.246.0.0/16',
4829 'ME': '178.175.0.0/17',
4830 'MF': '74.112.232.0/21',
4831 'MG': '154.126.0.0/17',
4832 'MH': '117.103.88.0/21',
4833 'MK': '77.28.0.0/15',
4834 'ML': '154.118.128.0/18',
4835 'MM': '37.111.0.0/17',
4836 'MN': '49.0.128.0/17',
4837 'MO': '60.246.0.0/16',
4838 'MP': '202.88.64.0/20',
4839 'MQ': '109.203.224.0/19',
4840 'MR': '41.188.64.0/18',
4841 'MS': '208.90.112.0/22',
4842 'MT': '46.11.0.0/16',
4843 'MU': '105.16.0.0/12',
4844 'MV': '27.114.128.0/18',
4845 'MW': '102.70.0.0/15',
4846 'MX': '187.192.0.0/11',
4847 'MY': '175.136.0.0/13',
4848 'MZ': '197.218.0.0/15',
4849 'NA': '41.182.0.0/16',
4850 'NC': '101.101.0.0/18',
4851 'NE': '197.214.0.0/18',
4852 'NF': '203.17.240.0/22',
4853 'NG': '105.112.0.0/12',
4854 'NI': '186.76.0.0/15',
4855 'NL': '145.96.0.0/11',
4856 'NO': '84.208.0.0/13',
4857 'NP': '36.252.0.0/15',
4858 'NR': '203.98.224.0/19',
4859 'NU': '49.156.48.0/22',
4860 'NZ': '49.224.0.0/14',
4861 'OM': '5.36.0.0/15',
4862 'PA': '186.72.0.0/15',
4863 'PE': '186.160.0.0/14',
4864 'PF': '123.50.64.0/18',
4865 'PG': '124.240.192.0/19',
4866 'PH': '49.144.0.0/13',
4867 'PK': '39.32.0.0/11',
4868 'PL': '83.0.0.0/11',
4869 'PM': '70.36.0.0/20',
4870 'PR': '66.50.0.0/16',
4871 'PS': '188.161.0.0/16',
4872 'PT': '85.240.0.0/13',
4873 'PW': '202.124.224.0/20',
4874 'PY': '181.120.0.0/14',
4875 'QA': '37.210.0.0/15',
4876 'RE': '102.35.0.0/16',
4877 'RO': '79.112.0.0/13',
4878 'RS': '93.86.0.0/15',
4879 'RU': '5.136.0.0/13',
4880 'RW': '41.186.0.0/16',
4881 'SA': '188.48.0.0/13',
4882 'SB': '202.1.160.0/19',
4883 'SC': '154.192.0.0/11',
4884 'SD': '102.120.0.0/13',
4885 'SE': '78.64.0.0/12',
4886 'SG': '8.128.0.0/10',
4887 'SI': '188.196.0.0/14',
4888 'SK': '78.98.0.0/15',
4889 'SL': '102.143.0.0/17',
4890 'SM': '89.186.32.0/19',
4891 'SN': '41.82.0.0/15',
4892 'SO': '154.115.192.0/18',
4893 'SR': '186.179.128.0/17',
4894 'SS': '105.235.208.0/21',
4895 'ST': '197.159.160.0/19',
4896 'SV': '168.243.0.0/16',
4897 'SX': '190.102.0.0/20',
4898 'SY': '5.0.0.0/16',
4899 'SZ': '41.84.224.0/19',
4900 'TC': '65.255.48.0/20',
4901 'TD': '154.68.128.0/19',
4902 'TG': '196.168.0.0/14',
4903 'TH': '171.96.0.0/13',
4904 'TJ': '85.9.128.0/18',
4905 'TK': '27.96.24.0/21',
4906 'TL': '180.189.160.0/20',
4907 'TM': '95.85.96.0/19',
4908 'TN': '197.0.0.0/11',
4909 'TO': '175.176.144.0/21',
4910 'TR': '78.160.0.0/11',
4911 'TT': '186.44.0.0/15',
4912 'TV': '202.2.96.0/19',
4913 'TW': '120.96.0.0/11',
4914 'TZ': '156.156.0.0/14',
4915 'UA': '37.52.0.0/14',
4916 'UG': '102.80.0.0/13',
4917 'US': '6.0.0.0/8',
4918 'UY': '167.56.0.0/13',
4919 'UZ': '84.54.64.0/18',
4920 'VA': '212.77.0.0/19',
4921 'VC': '207.191.240.0/21',
4922 'VE': '186.88.0.0/13',
4923 'VG': '66.81.192.0/20',
4924 'VI': '146.226.0.0/16',
4925 'VN': '14.160.0.0/11',
4926 'VU': '202.80.32.0/20',
4927 'WF': '117.20.32.0/21',
4928 'WS': '202.4.32.0/19',
4929 'YE': '134.35.0.0/16',
4930 'YT': '41.242.116.0/22',
4931 'ZA': '41.0.0.0/11',
4932 'ZM': '102.144.0.0/13',
4933 'ZW': '102.177.192.0/18',
4934 }
4935
4936 @classmethod
4937 def random_ipv4(cls, code_or_block):
4938 if len(code_or_block) == 2:
4939 block = cls._country_ip_map.get(code_or_block.upper())
4940 if not block:
4941 return None
4942 else:
4943 block = code_or_block
4944 addr, preflen = block.split('/')
4945 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4946 addr_max = addr_min | (0xffffffff >> int(preflen))
4947 return str(socket.inet_ntoa(
4948 struct.pack('!L', random.randint(addr_min, addr_max))))
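# Illustrative usage (added note, not part of the original source; outputs are random):
#   GeoUtils.random_ipv4('DE') -> some address inside 53.0.0.0/8 (per the table above)
#   GeoUtils.random_ipv4('203.0.113.0/24') -> samples the given CIDR block directly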
4949
4950
4951 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4952 def __init__(self, proxies=None):
4953 # Set default handlers
4954 for type in ('http', 'https'):
4955 setattr(self, '%s_open' % type,
4956 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4957 meth(r, proxy, type))
4958 urllib.request.ProxyHandler.__init__(self, proxies)
4959
4960 def proxy_open(self, req, proxy, type):
4961 req_proxy = req.headers.get('Ytdl-request-proxy')
4962 if req_proxy is not None:
4963 proxy = req_proxy
4964 del req.headers['Ytdl-request-proxy']
4965
4966 if proxy == '__noproxy__':
4967 return None # No Proxy
4968 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4969 req.add_header('Ytdl-socks-proxy', proxy)
4970 # yt-dlp's http/https handlers take care of wrapping the socket with socks
4971 return None
4972 return urllib.request.ProxyHandler.proxy_open(
4973 self, req, proxy, type)
4974
4975
4976 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4977 # released into Public Domain
4978 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4979
4980 def long_to_bytes(n, blocksize=0):
4981 """long_to_bytes(n:long, blocksize:int) : string
4982 Convert a long integer to a byte string.
4983
4984 If optional blocksize is given and greater than zero, pad the front of the
4985 byte string with binary zeros so that the length is a multiple of
4986 blocksize.
4987 """
4988 # after much testing, this algorithm was deemed to be the fastest
4989 s = b''
4990 n = int(n)
4991 while n > 0:
4992 s = struct.pack('>I', n & 0xffffffff) + s
4993 n = n >> 32
4994 # strip off leading zeros
4995 for i in range(len(s)):
4996 if s[i] != b'\000'[0]:
4997 break
4998 else:
4999 # only happens when n == 0
5000 s = b'\000'
5001 i = 0
5002 s = s[i:]
5003 # add back some pad bytes. this could be done more efficiently w.r.t. the
5004 # de-padding being done above, but sigh...
5005 if blocksize > 0 and len(s) % blocksize:
5006 s = (blocksize - len(s) % blocksize) * b'\000' + s
5007 return s
5008
5009
5010 def bytes_to_long(s):
5011 """bytes_to_long(string) : long
5012 Convert a byte string to a long integer.
5013
5014 This is (essentially) the inverse of long_to_bytes().
5015 """
5016 acc = 0
5017 length = len(s)
5018 if length % 4:
5019 extra = (4 - length % 4)
5020 s = b'\000' * extra + s
5021 length = length + extra
5022 for i in range(0, length, 4):
5023 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
5024 return acc
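# Round-trip example (added for clarity):
#   >>> long_to_bytes(256)
#   b'\x01\x00'
#   >>> bytes_to_long(b'\x01\x00')
#   256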
5025
5026
5027 def ohdave_rsa_encrypt(data, exponent, modulus):
5028 '''
5029 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
5030
5031 Input:
5032 data: data to encrypt, bytes-like object
5033 exponent, modulus: parameter e and N of RSA algorithm, both integer
5034 Output: hex string of encrypted data
5035
5036 Limitation: supports one block encryption only
5037 '''
5038
5039 payload = int(binascii.hexlify(data[::-1]), 16)
5040 encrypted = pow(payload, exponent, modulus)
5041 return '%x' % encrypted
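# Toy example (an added sketch; real callers pass site-provided e and N):
#   >>> ohdave_rsa_encrypt(b'\x02', 3, 15)  # little-endian payload 2; pow(2, 3, 15) == 8
#   '8'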
5042
5043
5044 def pkcs1pad(data, length):
5045 """
5046 Padding input data with PKCS#1 scheme
5047
5048 @param {int[]} data input data
5049 @param {int} length target length
5050 @returns {int[]} padded data
5051 """
5052 if len(data) > length - 11:
5053 raise ValueError('Input data too long for PKCS#1 padding')
5054
5055 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 v1.5 padding bytes must be nonzero
5056 return [0, 2] + pseudo_random + [0] + data
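# Illustrative layout (added sketch): 0x00 0x02 <nonzero padding> 0x00 <data>
#   >>> padded = pkcs1pad([1, 2, 3], 16)
#   >>> padded[:2], padded[-4:], len(padded)
#   ([0, 2], [0, 1, 2, 3], 16)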
5057
5058
5059 def _base_n_table(n, table):
5060 if not table and not n:
5061 raise ValueError('Either table or n must be specified')
5062 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
5063
5064 if n and n != len(table):
5065 raise ValueError(f'base {n} exceeds table length {len(table)}')
5066 return table
5067
5068
5069 def encode_base_n(num, n=None, table=None):
5070 """Convert given int to a base-n string"""
5071 table = _base_n_table(n, table)
5072 if not num:
5073 return table[0]
5074
5075 result, base = '', len(table)
5076 while num:
5077 result = table[num % base] + result
5078 num = num // base
5079 return result
5080
5081
5082 def decode_base_n(string, n=None, table=None):
5083 """Convert given base-n string to int"""
5084 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
5085 result, base = 0, len(table)
5086 for char in string:
5087 result = result * base + table[char]
5088 return result
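# Round-trip example (added for clarity):
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255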
5089
5090
5091 def decode_base(value, digits):
5092 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
5093 f'in a future version. Use {__name__}.decode_base_n instead')
5094 return decode_base_n(value, table=digits)
5095
5096
5097 def decode_packed_codes(code):
5098 mobj = re.search(PACKED_CODES_RE, code)
5099 obfuscated_code, base, count, symbols = mobj.groups()
5100 base = int(base)
5101 count = int(count)
5102 symbols = symbols.split('|')
5103 symbol_table = {}
5104
5105 while count:
5106 count -= 1
5107 base_n_count = encode_base_n(count, base)
5108 symbol_table[base_n_count] = symbols[count] or base_n_count
5109
5110 return re.sub(
5111 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
5112 obfuscated_code)
5113
5114
5115 def caesar(s, alphabet, shift):
5116 if shift == 0:
5117 return s
5118 l = len(alphabet)
5119 return ''.join(
5120 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5121 for c in s)
5122
5123
5124 def rot47(s):
5125 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
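# Example (added for clarity); rot47 is its own inverse:
#   >>> rot47('foobar')
#   '7@@32C'
#   >>> rot47(rot47('foobar'))
#   'foobar'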
5126
5127
5128 def parse_m3u8_attributes(attrib):
5129 info = {}
5130 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5131 if val.startswith('"'):
5132 val = val[1:-1]
5133 info[key] = val
5134 return info
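# Illustrative parse (an added sketch, not part of the original source):
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}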
5135
5136
5137 def urshift(val, n):
5138 return val >> n if val >= 0 else (val + 0x100000000) >> n
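# Example (added for clarity): emulates JavaScript's unsigned >>> on 32-bit ints
#   >>> urshift(-1, 28)
#   15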
5139
5140
5141 # Based on png2str() written by @gdkchan and improved by @yokrysty
5142 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5143 def decode_png(png_data):
5144 # Reference: https://www.w3.org/TR/PNG/
5145 header = png_data[8:]
5146
5147 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
5148 raise OSError('Not a valid PNG file.')
5149
5150 int_map = {1: '>B', 2: '>H', 4: '>I'}
5151 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
5152
5153 chunks = []
5154
5155 while header:
5156 length = unpack_integer(header[:4])
5157 header = header[4:]
5158
5159 chunk_type = header[:4]
5160 header = header[4:]
5161
5162 chunk_data = header[:length]
5163 header = header[length:]
5164
5165 header = header[4:] # Skip CRC
5166
5167 chunks.append({
5168 'type': chunk_type,
5169 'length': length,
5170 'data': chunk_data
5171 })
5172
5173 ihdr = chunks[0]['data']
5174
5175 width = unpack_integer(ihdr[:4])
5176 height = unpack_integer(ihdr[4:8])
5177
5178 idat = b''
5179
5180 for chunk in chunks:
5181 if chunk['type'] == b'IDAT':
5182 idat += chunk['data']
5183
5184 if not idat:
5185 raise OSError('Unable to read PNG data.')
5186
5187 decompressed_data = bytearray(zlib.decompress(idat))
5188
5189 stride = width * 3
5190 pixels = []
5191
5192 def _get_pixel(idx):
5193 x = idx % stride
5194 y = idx // stride
5195 return pixels[y][x]
5196
5197 for y in range(height):
5198 basePos = y * (1 + stride)
5199 filter_type = decompressed_data[basePos]
5200
5201 current_row = []
5202
5203 pixels.append(current_row)
5204
5205 for x in range(stride):
5206 color = decompressed_data[1 + basePos + x]
5207 basex = y * stride + x
5208 left = 0
5209 up = 0
5210
5211 if x > 2:
5212 left = _get_pixel(basex - 3)
5213 if y > 0:
5214 up = _get_pixel(basex - stride)
5215
5216 if filter_type == 1: # Sub
5217 color = (color + left) & 0xff
5218 elif filter_type == 2: # Up
5219 color = (color + up) & 0xff
5220 elif filter_type == 3: # Average
5221 color = (color + ((left + up) >> 1)) & 0xff
5222 elif filter_type == 4: # Paeth
5223 a = left
5224 b = up
5225 c = 0
5226
5227 if x > 2 and y > 0:
5228 c = _get_pixel(basex - stride - 3)
5229
5230 p = a + b - c
5231
5232 pa = abs(p - a)
5233 pb = abs(p - b)
5234 pc = abs(p - c)
5235
5236 if pa <= pb and pa <= pc:
5237 color = (color + a) & 0xff
5238 elif pb <= pc:
5239 color = (color + b) & 0xff
5240 else:
5241 color = (color + c) & 0xff
5242
5243 current_row.append(color)
5244
5245 return width, height, pixels
5246
5247
5248 def write_xattr(path, key, value):
5249 # Windows: Write xattrs to NTFS Alternate Data Streams:
5250 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5251 if compat_os_name == 'nt':
5252 assert ':' not in key
5253 assert os.path.exists(path)
5254
5255 try:
5256 with open(f'{path}:{key}', 'wb') as f:
5257 f.write(value)
5258 except OSError as e:
5259 raise XAttrMetadataError(e.errno, e.strerror)
5260 return
5261
5262 # UNIX Method 1. Use xattrs/pyxattrs modules
5263
5264 setxattr = None
5265 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5266 # Unicode arguments are not supported in pyxattr until version 0.5.0
5267 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5268 if version_tuple(xattr.__version__) >= (0, 5, 0):
5269 setxattr = xattr.set
5270 elif xattr:
5271 setxattr = xattr.setxattr
5272
5273 if setxattr:
5274 try:
5275 setxattr(path, key, value)
5276 except OSError as e:
5277 raise XAttrMetadataError(e.errno, e.strerror)
5278 return
5279
5280 # UNIX Method 2. Use setfattr/xattr executables
5281 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5282 else 'xattr' if check_executable('xattr', ['-h']) else None)
5283 if not exe:
5284 raise XAttrUnavailableError(
5285 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5286 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5287
5288 value = value.decode()
5289 try:
5290 _, stderr, returncode = Popen.run(
5291 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5292 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5293 except OSError as e:
5294 raise XAttrMetadataError(e.errno, e.strerror)
5295 if returncode:
5296 raise XAttrMetadataError(returncode, stderr)
5297
5298
5299 def random_birthday(year_field, month_field, day_field):
5300 start_date = datetime.date(1950, 1, 1)
5301 end_date = datetime.date(1995, 12, 31)
5302 offset = random.randint(0, (end_date - start_date).days)
5303 random_date = start_date + datetime.timedelta(offset)
5304 return {
5305 year_field: str(random_date.year),
5306 month_field: str(random_date.month),
5307 day_field: str(random_date.day),
5308 }
5309
5310
5311 def find_available_port(interface=''):
5312 try:
5313 with socket.socket() as sock:
5314 sock.bind((interface, 0))
5315 return sock.getsockname()[1]
5316 except OSError:
5317 return None
5318
5319
5320 # Templates for internet shortcut files, which are plain text files.
5321 DOT_URL_LINK_TEMPLATE = '''\
5322 [InternetShortcut]
5323 URL=%(url)s
5324 '''
5325
5326 DOT_WEBLOC_LINK_TEMPLATE = '''\
5327 <?xml version="1.0" encoding="UTF-8"?>
5328 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5329 <plist version="1.0">
5330 <dict>
5331 \t<key>URL</key>
5332 \t<string>%(url)s</string>
5333 </dict>
5334 </plist>
5335 '''
5336
5337 DOT_DESKTOP_LINK_TEMPLATE = '''\
5338 [Desktop Entry]
5339 Encoding=UTF-8
5340 Name=%(filename)s
5341 Type=Link
5342 URL=%(url)s
5343 Icon=text-html
5344 '''
5345
5346 LINK_TEMPLATES = {
5347 'url': DOT_URL_LINK_TEMPLATE,
5348 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5349 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5350 }
5351
5352
5353 def iri_to_uri(iri):
5354 """
5355 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5356
5357 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it only percent-encodes (with an underlying UTF-8 encoding) the characters that are not already escaped, leaving the rest of the URI intact.
5358 """
5359
5360 iri_parts = urllib.parse.urlparse(iri)
5361
5362 if '[' in iri_parts.netloc:
5363 raise ValueError('IPv6 URIs are not yet supported.')
5364 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5365
5366 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5367
5368 net_location = ''
5369 if iri_parts.username:
5370 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5371 if iri_parts.password is not None:
5372 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5373 net_location += '@'
5374
5375 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5376 # The 'idna' encoding produces ASCII text.
5377 if iri_parts.port is not None and iri_parts.port != 80:
5378 net_location += ':' + str(iri_parts.port)
5379
5380 return urllib.parse.urlunparse(
5381 (iri_parts.scheme,
5382 net_location,
5383
5384 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5385
5386 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5387 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5388
5389 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5390 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5391
5392 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5393
5394 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
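# Illustrative conversion (an added sketch; the hostname goes through IDNA/punycode,
# the rest is UTF-8 percent-encoded):
#   >>> iri_to_uri('http://räksmörgås.example/blåbärsyltetöj')
#   'http://xn--rksmrgs-5wao1o.example/bl%C3%A5b%C3%A4rsyltet%C3%B6j'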
5395
5396
5397 def to_high_limit_path(path):
5398 if sys.platform in ['win32', 'cygwin']:
5399 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5400 return '\\\\?\\' + os.path.abspath(path)
5401
5402 return path
5403
5404
5405 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5406 val = traverse_obj(obj, *variadic(field))
5407 if not val if ignore is NO_DEFAULT else val in variadic(ignore):
5408 return default
5409 return template % func(val)
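# Illustrative usage (an added sketch):
#   >>> format_field({'view_count': 1000}, 'view_count', '%s views')
#   '1000 views'
#   >>> format_field({}, 'view_count', '%s views', default='no views')
#   'no views'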
5410
5411
5412 def clean_podcast_url(url):
5413 return re.sub(r'''(?x)
5414 (?:
5415 (?:
5416 chtbl\.com/track|
5417 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5418 play\.podtrac\.com
5419 )/[^/]+|
5420 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5421 flex\.acast\.com|
5422 pd(?:
5423 cn\.co| # https://podcorn.com/analytics-prefix/
5424 st\.fm # https://podsights.com/docs/
5425 )/e
5426 )/''', '', url)
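# Illustrative cleanup (an added sketch; the tracked URL below is hypothetical):
#   >>> clean_podcast_url('https://chtbl.com/track/12345/traffic.example.com/ep.mp3')
#   'https://traffic.example.com/ep.mp3'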
5427
5428
5429 _HEX_TABLE = '0123456789abcdef'
5430
5431
5432 def random_uuidv4():
5433 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')  # note: the 'y' digit is drawn from the full hex table, so the RFC 4122 variant bits are not guaranteed
5434
5435
5436 def make_dir(path, to_screen=None):
5437 try:
5438 dn = os.path.dirname(path)
5439 if dn:
5440 os.makedirs(dn, exist_ok=True)
5441 return True
5442 except OSError as err:
5443 if callable(to_screen):
5444 to_screen('unable to create directory ' + error_to_compat_str(err))
5445 return False
5446
5447
5448 def get_executable_path():
5449 from .update import _get_variant_and_executable_path
5450
5451 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5452
5453
5454 def get_user_config_dirs(package_name):
5455 # .config (e.g. ~/.config/package_name)
5456 xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config')
5457 yield os.path.join(xdg_config_home, package_name)
5458
5459 # appdata (%APPDATA%/package_name)
5460 appdata_dir = os.getenv('appdata')
5461 if appdata_dir:
5462 yield os.path.join(appdata_dir, package_name)
5463
5464 # home (~/.package_name)
5465 yield os.path.join(compat_expanduser('~'), f'.{package_name}')
5466
5467
5468 def get_system_config_dirs(package_name):
5469 # /etc/package_name
5470 yield os.path.join('/etc', package_name)
5471
5472
5473 def traverse_obj(
5474 obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
5475 casesense=True, is_user_input=False, traverse_string=False):
5476 """
5477 Safely traverse nested `dict`s and `Iterable`s
5478
5479 >>> obj = [{}, {"key": "value"}]
5480 >>> traverse_obj(obj, (1, "key"))
5481 "value"
5482
5483 Each of the provided `paths` is tested and the first producing a valid result will be returned.
5484 The next path will also be tested if the path branched but no results could be found.
5485 Supported values for traversal are `Mapping`, `Iterable` and `re.Match`.
5486 Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded.
5487
5488 The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5489
5490 The keys in the path can be one of:
5491 - `None`: Return the current object.
5492 - `set`: Requires the only item in the set to be a type or function,
5493 like `{type}`/`{func}`. If a `type`, returns only values
5494 of this type. If a function, returns `func(obj)`.
5495 - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
5496 - `slice`: Branch out and return all values in `obj[key]`.
5497 - `Ellipsis`: Branch out and return a list of all values.
5498 - `tuple`/`list`: Branch out and return a list of all matching values.
5499 Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5500 - `function`: Branch out and return values filtered by the function.
5501 Read as: `[value for key, value in obj if function(key, value)]`.
5502 For `Iterable`s, `key` is the index of the value.
5503 For `re.Match`es, `key` is the group number (0 = full match)
5504 as well as additionally any group names, if given.
5505 - `dict`: Transform the current object and return a matching dict.
5506 Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5507
5508 `tuple`, `list`, and `dict` all support nested paths and branches.
5509
5510 @param paths Paths to traverse by.
5511 @param default Value to return if the paths do not match.
5512 If the last key in the path is a `dict`, it will apply to each value inside
5513 the dict instead, depth first. Try to avoid if using nested `dict` keys.
5514 @param expected_type If a `type`, only accept final values of this type.
5515 If any other callable, try to call the function on each result.
5516 If the last key in the path is a `dict`, it will apply to each value inside
5517 the dict instead, recursively. This does respect branching paths.
5518 @param get_all If `False`, return the first matching result, otherwise all matching ones.
5519 @param casesense If `False`, consider string dictionary keys as case insensitive.
5520
5521 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5522
5523 @param is_user_input Whether the keys are generated from user input.
5524 If `True` strings get converted to `int`/`slice` if needed.
5525 @param traverse_string Whether to traverse into objects as strings.
5526 If `True`, any non-compatible object will first be
5527 converted into a string and then traversed into.
5528 The return value of that path will be a string instead,
5529 not respecting any further branching.
5530
5531
5532 @returns The result of the object traversal.
5533 If successful, `get_all=True`, and the path branches at least once,
5534 then a list of results is returned instead.
5535 If no `default` is given and the last path branches, a `list` of results
5536 is always returned. If a path ends on a `dict`, that result will always be a `dict`.
5537 """
5538 casefold = lambda k: k.casefold() if isinstance(k, str) else k
5539
5540 if isinstance(expected_type, type):
5541 type_test = lambda val: val if isinstance(val, expected_type) else None
5542 else:
5543 type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5544
5545 def apply_key(key, obj, is_last):
5546 branching = False
5547 result = None
5548
5549 if obj is None and traverse_string:
5550 if key is ... or callable(key) or isinstance(key, slice):
5551 branching = True
5552 result = ()
5553
5554 elif key is None:
5555 result = obj
5556
5557 elif isinstance(key, set):
5558 assert len(key) == 1, 'Set should only be used to wrap a single item'
5559 item = next(iter(key))
5560 if isinstance(item, type):
5561 if isinstance(obj, item):
5562 result = obj
5563 else:
5564 result = try_call(item, args=(obj,))
5565
5566 elif isinstance(key, (list, tuple)):
5567 branching = True
5568 result = itertools.chain.from_iterable(
5569 apply_path(obj, branch, is_last)[0] for branch in key)
5570
5571 elif key is ...:
5572 branching = True
5573 if isinstance(obj, collections.abc.Mapping):
5574 result = obj.values()
5575 elif is_iterable_like(obj):
5576 result = obj
5577 elif isinstance(obj, re.Match):
5578 result = obj.groups()
5579 elif traverse_string:
5580 branching = False
5581 result = str(obj)
5582 else:
5583 result = ()
5584
5585 elif callable(key):
5586 branching = True
5587 if isinstance(obj, collections.abc.Mapping):
5588 iter_obj = obj.items()
5589 elif is_iterable_like(obj):
5590 iter_obj = enumerate(obj)
5591 elif isinstance(obj, re.Match):
5592 iter_obj = itertools.chain(
5593 enumerate((obj.group(), *obj.groups())),
5594 obj.groupdict().items())
5595 elif traverse_string:
5596 branching = False
5597 iter_obj = enumerate(str(obj))
5598 else:
5599 iter_obj = ()
5600
5601 result = (v for k, v in iter_obj if try_call(key, args=(k, v)))
5602 if not branching: # string traversal
5603 result = ''.join(result)
5604
5605 elif isinstance(key, dict):
5606 iter_obj = ((k, _traverse_obj(obj, v, False, is_last)) for k, v in key.items())
5607 result = {
5608 k: v if v is not None else default for k, v in iter_obj
5609 if v is not None or default is not NO_DEFAULT
5610 } or None
5611
5612 elif isinstance(obj, collections.abc.Mapping):
5613 result = (try_call(obj.get, args=(key,)) if casesense or try_call(obj.__contains__, args=(key,)) else
5614 next((v for k, v in obj.items() if casefold(k) == key), None))
5615
5616 elif isinstance(obj, re.Match):
5617 if isinstance(key, int) or casesense:
5618 with contextlib.suppress(IndexError):
5619 result = obj.group(key)
5620
5621 elif isinstance(key, str):
5622 result = next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
5623
5624 elif isinstance(key, (int, slice)):
5625 if is_iterable_like(obj, collections.abc.Sequence):
5626 branching = isinstance(key, slice)
5627 with contextlib.suppress(IndexError):
5628 result = obj[key]
5629 elif traverse_string:
5630 with contextlib.suppress(IndexError):
5631 result = str(obj)[key]
5632
5633 return branching, result if branching else (result,)
5634
5635 def lazy_last(iterable):
5636 iterator = iter(iterable)
5637 prev = next(iterator, NO_DEFAULT)
5638 if prev is NO_DEFAULT:
5639 return
5640
5641 for item in iterator:
5642 yield False, prev
5643 prev = item
5644
5645 yield True, prev
5646
5647 def apply_path(start_obj, path, test_type):
5648 objs = (start_obj,)
5649 has_branched = False
5650
5651 key = None
5652 for last, key in lazy_last(variadic(path, (str, bytes, dict, set))):
5653 if is_user_input and isinstance(key, str):
5654 if key == ':':
5655 key = ...
5656 elif ':' in key:
5657 key = slice(*map(int_or_none, key.split(':')))
5658 elif int_or_none(key) is not None:
5659 key = int(key)
5660
5661 if not casesense and isinstance(key, str):
5662 key = key.casefold()
5663
5664 if __debug__ and callable(key):
5665 # Verify function signature
5666 inspect.signature(key).bind(None, None)
5667
5668 new_objs = []
5669 for obj in objs:
5670 branching, results = apply_key(key, obj, last)
5671 has_branched |= branching
5672 new_objs.append(results)
5673
5674 objs = itertools.chain.from_iterable(new_objs)
5675
5676 if test_type and not isinstance(key, (dict, list, tuple)):
5677 objs = map(type_test, objs)
5678
5679 return objs, has_branched, isinstance(key, dict)
5680
5681 def _traverse_obj(obj, path, allow_empty, test_type):
5682 results, has_branched, is_dict = apply_path(obj, path, test_type)
5683 results = LazyList(item for item in results if item not in (None, {}))
5684 if get_all and has_branched:
5685 if results:
5686 return results.exhaust()
5687 if allow_empty:
5688 return [] if default is NO_DEFAULT else default
5689 return None
5690
5691 return results[0] if results else {} if allow_empty and is_dict else None
5692
5693 for index, path in enumerate(paths, 1):
5694 result = _traverse_obj(obj, path, index == len(paths), True)
5695 if result is not None:
5696 return result
5697
5698 return None if default is NO_DEFAULT else default
5699
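# A few illustrative calls (sketches; `d` is hypothetical sample data):
#   >>> d = {'items': [{'id': 1, 'name': 'a'}, {'id': 2}]}
#   >>> traverse_obj(d, ('items', 0, 'name'))           # plain key path
#   'a'
#   >>> traverse_obj(d, ('items', ..., 'name'))         # `...` branches; missing values are discarded
#   ['a']
#   >>> traverse_obj(d, ('items', ..., {'key': 'id'}))  # `dict` keys transform each branch
#   [{'key': 1}, {'key': 2}]
#   >>> traverse_obj(d, 'missing', ('items', 1, 'id'))  # the first path producing a result wins
#   2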
5700
5701 def traverse_dict(dictn, keys, casesense=True):
5702 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5703 f'in a future version. Use "{__name__}.traverse_obj" instead')
5704 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5705
5706
5707 def get_first(obj, *paths, **kwargs):
5708 return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False)
5709
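# Sketch: `get_first` prepends `...` to each path and returns the first hit:
#   >>> get_first([{'a': None}, {'a': 1}], 'a')
#   1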
5710
5711 def time_seconds(**kwargs):
5712 """
5713 Returns the current time in seconds since the epoch (1970-01-01T00:00:00Z), offset by the given `datetime.timedelta` keyword arguments
5714 """
5715 return time.time() + datetime.timedelta(**kwargs).total_seconds()
5716
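# Sketch: the keyword arguments are passed to `datetime.timedelta` as an offset, e.g.
#   >>> round(time_seconds(hours=1) - time_seconds())
#   3600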
5717
5718 # create a JSON Web Signature (jws) with HS256 algorithm
5719 # the resulting format is in JWS Compact Serialization
5720 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5721 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5722 def jwt_encode_hs256(payload_data, key, headers={}):
5723 header_data = {
5724 'alg': 'HS256',
5725 'typ': 'JWT',
5726 }
5727 if headers:
5728 header_data.update(headers)
5729 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5730 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5731 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5732 signature_b64 = base64.b64encode(h.digest())
5733 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5734 return token
5735
5736
5737 # can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5738 def jwt_decode_hs256(jwt):
5739 header_b64, payload_b64, signature_b64 = jwt.split('.')
5740 # add trailing ='s that may have been stripped, superfluous ='s are ignored
5741 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
5742 return payload_data
5743
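# Round-trip sketch (note that `jwt_decode_hs256` does not verify the signature):
#   >>> token = jwt_encode_hs256({'uid': 1}, 'secret')  # returns bytes
#   >>> jwt_decode_hs256(token.decode())
#   {'uid': 1}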
5744
5745 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5746
5747
5748 @functools.cache
5749 def supports_terminal_sequences(stream):
5750 if compat_os_name == 'nt':
5751 if not WINDOWS_VT_MODE:
5752 return False
5753 elif not os.getenv('TERM'):
5754 return False
5755 try:
5756 return stream.isatty()
5757 except BaseException:
5758 return False
5759
5760
5761 def windows_enable_vt_mode():
5762 """Ref: https://bugs.python.org/issue30075 """
5763 if get_windows_version() < (10, 0, 10586):
5764 return
5765
5766 import ctypes
5767 import ctypes.wintypes
5768 import msvcrt
5769
5770 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
5771
5772 dll = ctypes.WinDLL('kernel32', use_last_error=False)
5773 handle = os.open('CONOUT$', os.O_RDWR)
5774 try:
5775 h_out = ctypes.wintypes.HANDLE(msvcrt.get_osfhandle(handle))
5776 dw_original_mode = ctypes.wintypes.DWORD()
5777 success = dll.GetConsoleMode(h_out, ctypes.byref(dw_original_mode))
5778 if not success:
5779 raise Exception('GetConsoleMode failed')
5780
5781 success = dll.SetConsoleMode(h_out, ctypes.wintypes.DWORD(
5782 dw_original_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING))
5783 if not success:
5784 raise Exception('SetConsoleMode failed')
5785 finally:
5786 os.close(handle)
5787
5788 global WINDOWS_VT_MODE
5789 WINDOWS_VT_MODE = True
5790 supports_terminal_sequences.cache_clear()
5791
5792
5793 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5794
5795
5796 def remove_terminal_sequences(string):
5797 return _terminal_sequences_re.sub('', string)
5798
5799
5800 def number_of_digits(number):
5801 return len('%d' % number)
5802
5803
5804 def join_nonempty(*values, delim='-', from_dict=None):
5805 if from_dict is not None:
5806 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5807 return delim.join(map(str, filter(None, values)))
5808
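# Examples (illustrative sketches); falsy values are dropped before joining:
#   >>> join_nonempty('a', None, '', 'b')
#   'a-b'
#   >>> join_nonempty('vcodec', 'acodec', from_dict={'vcodec': 'h264', 'acodec': None})
#   'h264'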
5809
5810 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5811 """
5812 Find the largest format dimensions in terms of video width and, for each thumbnail:
5813 * Modify the URL: Match the width with the provided regex and replace with the former width
5814 * Update dimensions
5815
5816 This function is useful with video services that scale the provided thumbnails on demand
5817 """
5818 _keys = ('width', 'height')
5819 max_dimensions = max(
5820 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5821 default=(0, 0))
5822 if not max_dimensions[0]:
5823 return thumbnails
5824 return [
5825 merge_dicts(
5826 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5827 dict(zip(_keys, max_dimensions)), thumbnail)
5828 for thumbnail in thumbnails
5829 ]
5830
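# Sketch with hypothetical data; `merge_dicts` (defined earlier in this file) gives
# priority to earlier dicts, so the computed dimensions override the thumbnail's own:
#   >>> scale_thumbnails_to_max_format_width(
#   ...     [{'width': 1920, 'height': 1080}],
#   ...     [{'url': 'https://example.com/320/img.jpg', 'width': 320, 'height': 180}],
#   ...     r'\b320\b')
#   [{'url': 'https://example.com/1920/img.jpg', 'width': 1920, 'height': 1080}]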
5831
5832 def parse_http_range(range):
5833 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5834 if not range:
5835 return None, None, None
5836 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5837 if not crg:
5838 return None, None, None
5839 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5840
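# Examples (illustrative):
#   >>> parse_http_range('bytes 0-499/1234')   # Content-Range style
#   (0, 499, 1234)
#   >>> parse_http_range('bytes=500-')         # open-ended Range
#   (500, None, None)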
5841
5842 def read_stdin(what):
5843 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5844 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5845 return sys.stdin
5846
5847
5848 def determine_file_encoding(data):
5849 """
5850 Detect the text encoding used
5851 @returns (encoding, bytes to skip)
5852 """
5853
5854 # BOMs are given priority over coding declarations
5855 for bom, enc in BOMS:
5856 if data.startswith(bom):
5857 return enc, len(bom)
5858
5859 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5860 # We ignore the endianness to get a good enough match
5861 data = data.replace(b'\0', b'')
5862 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5863 return mobj.group(1).decode() if mobj else None, 0
5864
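# Sketches (the first assumes `BOMS`, defined earlier in this file, maps the
# UTF-8 BOM to 'utf-8'):
#   >>> determine_file_encoding(b'\xef\xbb\xbf--flag')
#   ('utf-8', 3)
#   >>> determine_file_encoding(b'# coding: utf-8\n--flag')
#   ('utf-8', 0)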
5865
5866 class Config:
5867 own_args = None
5868 parsed_args = None
5869 filename = None
5870 __initialized = False
5871
5872 def __init__(self, parser, label=None):
5873 self.parser, self.label = parser, label
5874 self._loaded_paths, self.configs = set(), []
5875
5876 def init(self, args=None, filename=None):
5877 assert not self.__initialized
5878 self.own_args, self.filename = args, filename
5879 return self.load_configs()
5880
5881 def load_configs(self):
5882 directory = ''
5883 if self.filename:
5884 location = os.path.realpath(self.filename)
5885 directory = os.path.dirname(location)
5886 if location in self._loaded_paths:
5887 return False
5888 self._loaded_paths.add(location)
5889
5890 self.__initialized = True
5891 opts, _ = self.parser.parse_known_args(self.own_args)
5892 self.parsed_args = self.own_args
5893 for location in opts.config_locations or []:
5894 if location == '-':
5895 if location in self._loaded_paths:
5896 continue
5897 self._loaded_paths.add(location)
5898 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5899 continue
5900 location = os.path.join(directory, expand_path(location))
5901 if os.path.isdir(location):
5902 location = os.path.join(location, 'yt-dlp.conf')
5903 if not os.path.exists(location):
5904 self.parser.error(f'config location {location} does not exist')
5905 self.append_config(self.read_file(location), location)
5906 return True
5907
5908 def __str__(self):
5909 label = join_nonempty(
5910 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5911 delim=' ')
5912 return join_nonempty(
5913 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5914 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5915 delim='\n')
5916
5917 @staticmethod
5918 def read_file(filename, default=[]):
5919 try:
5920 optionf = open(filename, 'rb')
5921 except OSError:
5922 return default # silently skip if file is not present
5923 try:
5924 enc, skip = determine_file_encoding(optionf.read(512))
5925 optionf.seek(skip, io.SEEK_SET)
5926 except OSError:
5927 enc = None # silently skip read errors
5928 try:
5929 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5930 contents = optionf.read().decode(enc or preferredencoding())
5931 res = shlex.split(contents, comments=True)
5932 except Exception as err:
5933 raise ValueError(f'Unable to parse "{filename}": {err}')
5934 finally:
5935 optionf.close()
5936 return res
5937
5938 @staticmethod
5939 def hide_login_info(opts):
5940 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5941 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5942
5943 def _scrub_eq(o):
5944 m = eqre.match(o)
5945 if m:
5946 return m.group('key') + '=PRIVATE'
5947 else:
5948 return o
5949
5950 opts = list(map(_scrub_eq, opts))
5951 for idx, opt in enumerate(opts):
5952 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5953 opts[idx + 1] = 'PRIVATE'
5954 return opts
5955
5956 def append_config(self, *args, label=None):
5957 config = type(self)(self.parser, label)
5958 config._loaded_paths = self._loaded_paths
5959 if config.init(*args):
5960 self.configs.append(config)
5961
5962 @property
5963 def all_args(self):
5964 for config in reversed(self.configs):
5965 yield from config.all_args
5966 yield from self.parsed_args or []
5967
5968 def parse_known_args(self, **kwargs):
5969 return self.parser.parse_known_args(self.all_args, **kwargs)
5970
5971 def parse_args(self):
5972 return self.parser.parse_args(self.all_args)
5973
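# Sketch of `Config.hide_login_info` (a static helper, usable without instantiation):
#   >>> Config.hide_login_info(['-u', 'me@example.com', '--password=hunter2'])
#   ['-u', 'PRIVATE', '--password=PRIVATE']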
5974
5975 class WebSocketsWrapper:
5976 """Wraps websockets module to use in non-async scopes"""
5977 pool = None
5978
5979 def __init__(self, url, headers=None, connect=True):
5980 self.loop = asyncio.new_event_loop()
5981 # XXX: "loop" is deprecated
5982 self.conn = websockets.connect(
5983 url, extra_headers=headers, ping_interval=None,
5984 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5985 if connect:
5986 self.__enter__()
5987 atexit.register(self.__exit__, None, None, None)
5988
5989 def __enter__(self):
5990 if not self.pool:
5991 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5992 return self
5993
5994 def send(self, *args):
5995 self.run_with_loop(self.pool.send(*args), self.loop)
5996
5997 def recv(self, *args):
5998 return self.run_with_loop(self.pool.recv(*args), self.loop)
5999
6000 def __exit__(self, type, value, traceback):
6001 try:
6002 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
6003 finally:
6004 self._cancel_all_tasks(self.loop)  # cancel pending tasks before closing the loop
6005 self.loop.close()
6006
6007 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
6008 # For contributors: if any new library that uses asyncio needs to be run in non-async scopes, move these functions out of this class
6009 @staticmethod
6010 def run_with_loop(main, loop):
6011 if not asyncio.iscoroutine(main):
6012 raise ValueError(f'a coroutine was expected, got {main!r}')
6013
6014 try:
6015 return loop.run_until_complete(main)
6016 finally:
6017 loop.run_until_complete(loop.shutdown_asyncgens())
6018 if hasattr(loop, 'shutdown_default_executor'):
6019 loop.run_until_complete(loop.shutdown_default_executor())
6020
6021 @staticmethod
6022 def _cancel_all_tasks(loop):
6023 to_cancel = asyncio.all_tasks(loop)
6024
6025 if not to_cancel:
6026 return
6027
6028 for task in to_cancel:
6029 task.cancel()
6030
6031 # XXX: the "loop" argument is removed in Python 3.10+
6032 loop.run_until_complete(
6033 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
6034
6035 for task in to_cancel:
6036 if task.cancelled():
6037 continue
6038 if task.exception() is not None:
6039 loop.call_exception_handler({
6040 'message': 'unhandled exception during asyncio.run() shutdown',
6041 'exception': task.exception(),
6042 'task': task,
6043 })
6044
6045
6046 def merge_headers(*dicts):
6047 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
6048 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
6049
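# Example (illustrative): keys are title-cased, later dicts win, first position is kept:
#   >>> merge_headers({'user-agent': 'UA1', 'Accept': '*/*'}, {'USER-AGENT': 'UA2'})
#   {'User-Agent': 'UA2', 'Accept': '*/*'}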
6050
6051 def cached_method(f):
6052 """Cache a method"""
6053 signature = inspect.signature(f)
6054
6055 @functools.wraps(f)
6056 def wrapper(self, *args, **kwargs):
6057 bound_args = signature.bind(self, *args, **kwargs)
6058 bound_args.apply_defaults()
6059 key = tuple(bound_args.arguments.values())[1:]
6060
6061 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
6062 if key not in cache:
6063 cache[key] = f(self, *args, **kwargs)
6064 return cache[key]
6065 return wrapper
6066
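# Usage sketch (hypothetical class): results are memoized per instance and argument tuple.
#
# class Example:
#     @cached_method
#     def double(self, x):
#         print('computing')
#         return x * 2
#
# Example().double(2) prints 'computing' and returns 4; a second call with the
# same argument on the same instance returns the cached 4 without recomputing.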
6067
6068 class classproperty:
6069 """property access for class methods with optional caching"""
6070 def __new__(cls, func=None, *args, **kwargs):
6071 if not func:
6072 return functools.partial(cls, *args, **kwargs)
6073 return super().__new__(cls)
6074
6075 def __init__(self, func, *, cache=False):
6076 functools.update_wrapper(self, func)
6077 self.func = func
6078 self._cache = {} if cache else None
6079
6080 def __get__(self, _, cls):
6081 if self._cache is None:
6082 return self.func(cls)
6083 elif cls not in self._cache:
6084 self._cache[cls] = self.func(cls)
6085 return self._cache[cls]
6086
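# Usage sketch (hypothetical class):
#
# class Example:
#     @classproperty
#     def name(cls):  # recomputed on every access
#         return cls.__name__
#
#     @classproperty(cache=True)
#     def expensive(cls):  # computed once per class, then cached
#         return cls.__name__ * 1000  # stand-in for an expensive computation
#
# Example.name  # -> 'Example', without instantiating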
6087
6088 class function_with_repr:
6089 def __init__(self, func, repr_=None):
6090 functools.update_wrapper(self, func)
6091 self.func, self.__repr = func, repr_
6092
6093 def __call__(self, *args, **kwargs):
6094 return self.func(*args, **kwargs)
6095
6096 def __repr__(self):
6097 if self.__repr:
6098 return self.__repr
6099 return f'{self.func.__module__}.{self.func.__qualname__}'
6100
6101
6102 class Namespace(types.SimpleNamespace):
6103 """Immutable namespace"""
6104
6105 def __iter__(self):
6106 return iter(self.__dict__.values())
6107
6108 @property
6109 def items_(self):
6110 return self.__dict__.items()
6111
6112
6113 MEDIA_EXTENSIONS = Namespace(
6114 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
6115 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
6116 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
6117 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma', 'weba'),
6118 thumbnails=('jpg', 'png', 'webp'),
6119 storyboards=('mhtml', ),
6120 subtitles=('srt', 'vtt', 'ass', 'lrc'),
6121 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
6122 )
6123 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
6124 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
6125
6126 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
6127
6128
6129 class RetryManager:
6130 """Usage:
6131 for retry in RetryManager(...):
6132 try:
6133 ...
6134 except SomeException as err:
6135 retry.error = err
6136 continue
6137 """
6138 attempt, _error = 0, None
6139
6140 def __init__(self, _retries, _error_callback, **kwargs):
6141 self.retries = _retries or 0
6142 self.error_callback = functools.partial(_error_callback, **kwargs)
6143
6144 def _should_retry(self):
6145 return self._error is not NO_DEFAULT and self.attempt <= self.retries
6146
6147 @property
6148 def error(self):
6149 if self._error is NO_DEFAULT:
6150 return None
6151 return self._error
6152
6153 @error.setter
6154 def error(self, value):
6155 self._error = value
6156
6157 def __iter__(self):
6158 while self._should_retry():
6159 self.error = NO_DEFAULT
6160 self.attempt += 1
6161 yield self
6162 if self.error:
6163 self.error_callback(self.error, self.attempt, self.retries)
6164
6165 @staticmethod
6166 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
6167 """Utility function for reporting retries"""
6168 if count > retries:
6169 if error:
6170 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
6171 raise e
6172
6173 if not count:
6174 return warn(e)
6175 elif isinstance(e, ExtractorError):
6176 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
6177 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
6178
6179 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
6180 if delay:
6181 info(f'Sleeping {delay:.2f} seconds ...')
6182 time.sleep(delay)
6183
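# Fuller usage sketch, pairing the manager with `report_retry` as the callback
# (`flaky_download` is hypothetical). This retries up to 3 times, warning and
# sleeping 1 second between attempts, and re-raises once retries are exhausted:
#
# for retry in RetryManager(3, RetryManager.report_retry,
#                           sleep_func=1, info=print, warn=print):
#     try:
#         flaky_download()
#     except OSError as err:
#         retry.error = err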
6184
6185 def make_archive_id(ie, video_id):
6186 ie_key = ie if isinstance(ie, str) else ie.ie_key()
6187 return f'{ie_key.lower()} {video_id}'
6188
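# Illustrative result:
#   >>> make_archive_id('Youtube', 'dQw4w9WgXcQ')
#   'youtube dQw4w9WgXcQ'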
6189
6190 def truncate_string(s, left, right=0):
6191 assert left > 3 and right >= 0
6192 if s is None or len(s) <= left + right:
6193 return s
6194 return f'{s[:left-3]}...{s[-right:] if right else ""}'
6195
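# Examples (illustrative):
#   >>> truncate_string('abcdefghij', 7)
#   'abcd...'
#   >>> truncate_string('abcdefghij', 6, 2)
#   'abc...ij'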
6196
6197 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
6198 assert 'all' in alias_dict, '"all" alias is required'
6199 requested = list(start or [])
6200 for val in options:
6201 discard = val.startswith('-')
6202 if discard:
6203 val = val[1:]
6204
6205 if val in alias_dict:
6206 val = alias_dict[val] if not discard else [
6207 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
6208 # NB: Do not allow regex in aliases for performance
6209 requested = orderedSet_from_options(val, alias_dict, start=requested)
6210 continue
6211
6212 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
6213 else [val] if val in alias_dict['all'] else None)
6214 if current is None:
6215 raise ValueError(val)
6216
6217 if discard:
6218 for item in current:
6219 while item in requested:
6220 requested.remove(item)
6221 else:
6222 requested.extend(current)
6223
6224 return orderedSet(requested)
6225
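# Sketch with a hypothetical alias table; '-'-prefixed values discard entries
# and aliases expand recursively:
#   >>> alias_dict = {'all': ['a', 'b', 'c'], 'default': ['a', 'b']}
#   >>> orderedSet_from_options(['default', '-b', 'c'], alias_dict)
#   ['a', 'c']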
6226
6227 class FormatSorter:
6228 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
6229
6230 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6231 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6232 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
6233 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6234 'height', 'width', 'proto', 'vext', 'abr', 'aext',
6235 'fps', 'fs_approx', 'source', 'id')
6236
6237 settings = {
6238 'vcodec': {'type': 'ordered', 'regex': True,
6239 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6240 'acodec': {'type': 'ordered', 'regex': True,
6241 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'ac-?4', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
6242 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6243 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6244 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6245 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6246 'vext': {'type': 'ordered', 'field': 'video_ext',
6247 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6248 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
6249 'aext': {'type': 'ordered', 'regex': True, 'field': 'audio_ext',
6250 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'web[am]', '', 'none'),
6251 'order_free': ('ogg', 'opus', 'web[am]', 'mp3', 'm4a', 'aac', '', 'none')},
6252 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
6253 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6254 'field': ('vcodec', 'acodec'),
6255 'function': lambda it: int(any(v != 'none' for v in it))},
6256 'ie_pref': {'priority': True, 'type': 'extractor'},
6257 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
6258 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
6259 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
6260 'quality': {'convert': 'float', 'default': -1},
6261 'filesize': {'convert': 'bytes'},
6262 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
6263 'id': {'convert': 'string', 'field': 'format_id'},
6264 'height': {'convert': 'float_none'},
6265 'width': {'convert': 'float_none'},
6266 'fps': {'convert': 'float_none'},
6267 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
6268 'tbr': {'convert': 'float_none'},
6269 'vbr': {'convert': 'float_none'},
6270 'abr': {'convert': 'float_none'},
6271 'asr': {'convert': 'float_none'},
6272 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
6273
6274 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
6275 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
6276 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
6277 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
6278 'res': {'type': 'multiple', 'field': ('height', 'width'),
6279 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
6280
6281 # Actual field names
6282 'format_id': {'type': 'alias', 'field': 'id'},
6283 'preference': {'type': 'alias', 'field': 'ie_pref'},
6284 'language_preference': {'type': 'alias', 'field': 'lang'},
6285 'source_preference': {'type': 'alias', 'field': 'source'},
6286 'protocol': {'type': 'alias', 'field': 'proto'},
6287 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
6288 'audio_channels': {'type': 'alias', 'field': 'channels'},
6289
6290 # Deprecated
6291 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
6292 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
6293 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
6294 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
6295 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
6296 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
6297 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
6298 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
6299 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
6300 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
6301 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
6302 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
6303 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
6304 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
6305 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6306 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6307 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6308 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6309 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6310 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6311 }
6312
6313 def __init__(self, ydl, field_preference):
6314 self.ydl = ydl
6315 self._order = []
6316 self.evaluate_params(self.ydl.params, field_preference)
6317 if ydl.params.get('verbose'):
6318 self.print_verbose_info(self.ydl.write_debug)
6319
6320 def _get_field_setting(self, field, key):
6321 if field not in self.settings:
6322 if key in ('forced', 'priority'):
6323 return False
6324 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
6325 'deprecated and may be removed in a future version')
6326 self.settings[field] = {}
6327 propObj = self.settings[field]
6328 if key not in propObj:
6329 type = propObj.get('type')
6330 if key == 'field':
6331 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
6332 elif key == 'convert':
6333 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
6334 else:
6335 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
6336 propObj[key] = default
6337 return propObj[key]
6338
6339 def _resolve_field_value(self, field, value, convertNone=False):
6340 if value is None:
6341 if not convertNone:
6342 return None
6343 else:
6344 value = value.lower()
6345 conversion = self._get_field_setting(field, 'convert')
6346 if conversion == 'ignore':
6347 return None
6348 if conversion == 'string':
6349 return value
6350 elif conversion == 'float_none':
6351 return float_or_none(value)
6352 elif conversion == 'bytes':
6353 return parse_bytes(value)
6354 elif conversion == 'order':
6355 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
6356 use_regex = self._get_field_setting(field, 'regex')
6357 list_length = len(order_list)
6358 empty_pos = order_list.index('') if '' in order_list else list_length + 1
6359 if use_regex and value is not None:
6360 for i, regex in enumerate(order_list):
6361 if regex and re.match(regex, value):
6362 return list_length - i
6363 return list_length - empty_pos # not in list
6364 else: # not regex or value = None
6365 return list_length - (order_list.index(value) if value in order_list else empty_pos)
6366 else:
6367 if value.isnumeric():
6368 return float(value)
6369 else:
6370 self.settings[field]['convert'] = 'string'
6371 return value
6372
6373 def evaluate_params(self, params, sort_extractor):
6374 self._use_free_order = params.get('prefer_free_formats', False)
6375 self._sort_user = params.get('format_sort', [])
6376 self._sort_extractor = sort_extractor
6377
6378 def add_item(field, reverse, closest, limit_text):
6379 field = field.lower()
6380 if field in self._order:
6381 return
6382 self._order.append(field)
6383 limit = self._resolve_field_value(field, limit_text)
6384 data = {
6385 'reverse': reverse,
6386 'closest': False if limit is None else closest,
6387 'limit_text': limit_text,
6388 'limit': limit}
6389 if field in self.settings:
6390 self.settings[field].update(data)
6391 else:
6392 self.settings[field] = data
6393
6394 sort_list = (
6395 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
6396 + (tuple() if params.get('format_sort_force', False)
6397 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
6398 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
6399
6400 for item in sort_list:
6401 match = re.match(self.regex, item)
6402 if match is None:
6403 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
6404 field = match.group('field')
6405 if field is None:
6406 continue
6407 if self._get_field_setting(field, 'type') == 'alias':
6408 alias, field = field, self._get_field_setting(field, 'field')
6409 if self._get_field_setting(alias, 'deprecated'):
6410 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
6411 f'be removed in a future version. Please use {field} instead')
6412 reverse = match.group('reverse') is not None
6413 closest = match.group('separator') == '~'
6414 limit_text = match.group('limit')
6415
6416 has_limit = limit_text is not None
6417 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
6418 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
6419
6420 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6421 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6422 limit_count = len(limits)
6423 for (i, f) in enumerate(fields):
6424 add_item(f, reverse, closest,
6425 limits[i] if i < limit_count
6426 else limits[0] if has_limit and not has_multiple_limits
6427 else None)
6428
6429 def print_verbose_info(self, write_debug):
6430 if self._sort_user:
6431 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6432 if self._sort_extractor:
6433 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6434 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6435 '+' if self._get_field_setting(field, 'reverse') else '', field,
6436 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6437 self._get_field_setting(field, 'limit_text'),
6438 self._get_field_setting(field, 'limit'))
6439 if self._get_field_setting(field, 'limit_text') is not None else '')
6440 for field in self._order if self._get_field_setting(field, 'visible')]))
6441
6442 def _calculate_field_preference_from_value(self, format, field, type, value):
6443 reverse = self._get_field_setting(field, 'reverse')
6444 closest = self._get_field_setting(field, 'closest')
6445 limit = self._get_field_setting(field, 'limit')
6446
6447 if type == 'extractor':
6448 maximum = self._get_field_setting(field, 'max')
6449 if value is None or (maximum is not None and value >= maximum):
6450 value = -1
6451 elif type == 'boolean':
6452 in_list = self._get_field_setting(field, 'in_list')
6453 not_in_list = self._get_field_setting(field, 'not_in_list')
6454 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6455 elif type == 'ordered':
6456 value = self._resolve_field_value(field, value, True)
6457
6458 # try to convert to number
6459 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6460 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6461 if is_num:
6462 value = val_num
6463
6464 return ((-10, 0) if value is None
6465 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6466 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6467 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6468 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6469 else (-1, value, 0))
6470
6471 def _calculate_field_preference(self, format, field):
6472 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6473 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6474 if type == 'multiple':
6475 type = 'field' # Only 'field' is allowed in multiple for now
6476 actual_fields = self._get_field_setting(field, 'field')
6477
6478 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6479 else:
6480 value = get_value(field)
6481 return self._calculate_field_preference_from_value(format, field, type, value)
6482
6483 def calculate_preference(self, format):
6484 # Determine missing protocol
6485 if not format.get('protocol'):
6486 format['protocol'] = determine_protocol(format)
6487
6488 # Determine missing ext
6489 if not format.get('ext') and 'url' in format:
6490 format['ext'] = determine_ext(format['url'])
6491 if format.get('vcodec') == 'none':
6492 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6493 format['video_ext'] = 'none'
6494 else:
6495 format['video_ext'] = format['ext']
6496 format['audio_ext'] = 'none'
6497 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6498 # format['preference'] = -1000
6499
6500 if format.get('preference') is None and format.get('ext') == 'flv' and re.match('[hx]265|he?vc?', format.get('vcodec') or ''):
6501 # HEVC-over-FLV is not part of the original FLV specification
6502 # ref. https://trac.ffmpeg.org/ticket/6389
6503 # ref. https://github.com/yt-dlp/yt-dlp/pull/5821
6504 format['preference'] = -100
6505
6506 # Determine missing bitrates
6507 if format.get('tbr') is None:
6508 if format.get('vbr') is not None and format.get('abr') is not None:
6509 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6510 else:
6511 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6512 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6513 if format.get('acodec') != 'none' and format.get('abr') is None:
6514 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6515
6516 return tuple(self._calculate_field_preference(format, field) for field in self._order)
6517
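# Typical use (sketch; `ydl` must be a YoutubeDL-like object providing `params`,
# `deprecated_feature` and `write_debug`, and `formats` a list of format dicts):
#
# sorter = FormatSorter(ydl, field_preference=[])
# formats.sort(key=sorter.calculate_preference)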
6518
6519 # Deprecated
6520 has_certifi = bool(certifi)
6521 has_websockets = bool(websockets)
6522
6523
6524 def load_plugins(name, suffix, namespace):
6525 from .plugins import load_plugins
6526 ret = load_plugins(name, suffix)
6527 namespace.update(ret)
6528 return ret