import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
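
# Usage sketch (hypothetical namespace URI): xpath_with_ns expands `ns:tag`
# components of a path using the given mapping, e.g.
#   xpath_with_ns('media:song/media:author', {'media': 'http://example.com/ns'})
#   -> '{http://example.com/ns}song/{http://example.com/ns}author'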


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text
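
# Usage sketch (hypothetical XML): xpath_text falls back to `default` when the
# element or its text is missing, and raises ExtractorError when fatal=True:
#   doc = xml.etree.ElementTree.fromstring('<root><title>foo</title></root>')
#   xpath_text(doc, 'title')                  # -> 'foo'
#   xpath_text(doc, 'missing', default=None)  # -> None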


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
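
# Usage sketch: class matching is token-based, so a class attribute containing
# several class names still matches, e.g. (hypothetical HTML)
#   get_elements_by_class('foo', '<div class="foo bar">x</div><span class="foo">y</span>')
#   -> ['x', 'y']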


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
468 """
469 For the first element with the specified tag in the passed HTML document
470 return its' content (text) and the whole element (html)
471 """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
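
# Usage sketch: clean_html collapses whitespace, turns <br> and paragraph
# boundaries into newlines, strips the remaining tags and decodes entities, e.g.
#   clean_html('<p>foo<br/>bar</p>')  # -> 'foo\nbar'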


class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)
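
# Usage sketch: with ignore_extra=True, anything after the first complete JSON
# value is discarded instead of raising, e.g.
#   LenientJSONDecoder(ignore_extra=True).decode('{"a": 1} trailing garbage')  # -> {'a': 1}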


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream. Eg, when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    if restricted and is_id is NO_DEFAULT:
        s = unicodedata.normalize('NFKC', s)
        s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
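
# Usage sketch: in restricted mode, timestamps keep their digits but colons
# become underscores, and spaces are replaced, e.g.
#   sanitize_filename('live at 01:02:03', restricted=True)  # -> 'live_at_01_02_03'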


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
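
# Usage sketch:
#   sanitize_url('//example.com/a')       # -> 'http://example.com/a'
#   sanitize_url('httpss://example.com')  # -> 'https://example.com'
#   sanitize_url('rmtpe://example.com')   # -> 'rtmpe://example.com'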


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
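
# Usage sketch: credentials embedded in the URL are moved into a Basic auth
# header value, e.g.
#   extract_basic_auth('http://user:pass@example.com/x')
#   -> ('http://example.com/x', 'Basic dXNlcjpwYXNz')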


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
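
# Usage sketch: order-preserving de-duplication, e.g.
#   orderedSet([1, 2, 1, 3, 2])  # -> [1, 2, 3]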


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
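
# Usage sketch: named and numeric entities are decoded; unknown ones are kept
# literally, e.g.
#   unescapeHTML('&amp;')    # -> '&'
#   unescapeHTML('&#x61;')   # -> 'a'
#   unescapeHTML('&bogus;')  # -> '&bogus;'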


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
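
# Usage sketch:
#   timetuple_from_msec(3723004)  # -> Time(hours=1, minutes=2, seconds=3, milliseconds=4)
#   formatSeconds(3661)           # -> '1:01:01'
#   formatSeconds(45, msec=True)  # -> '45.000'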


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1657
1658 # A 303 must either use GET or HEAD for subsequent request
1659 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1660 if code == 303 and m != 'HEAD':
1661 m = 'GET'
1662 # 301 and 302 redirects are commonly turned into a GET from a POST
1663 # for subsequent requests by browsers, so we'll do the same.
1664 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1665 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1666 if code in (301, 302) and m == 'POST':
1667 m = 'GET'
1668
1669 return urllib.request.Request(
1670 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1671 unverifiable=True, method=m)
1672
1673
1674 def extract_timezone(date_str):
1675 m = re.search(
1676 r'''(?x)
1677 ^.{8,}? # >=8 char non-TZ prefix, if present
1678 (?P<tz>Z| # just the UTC Z, or
1679 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1680 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1681 [ ]? # optional space
1682 (?P<sign>\+|-) # +/-
1683 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1684 $)
1685 ''', date_str)
1686 if not m:
1687 timezone = datetime.timedelta()
1688 else:
1689 date_str = date_str[:-len(m.group('tz'))]
1690 if not m.group('sign'):
1691 timezone = datetime.timedelta()
1692 else:
1693 sign = 1 if m.group('sign') == '+' else -1
1694 timezone = datetime.timedelta(
1695 hours=sign * int(m.group('hours')),
1696 minutes=sign * int(m.group('minutes')))
1697 return timezone, date_str
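# Illustrative behaviour (hand-checked against the regex above):
# >>> extract_timezone('2023-01-01T12:00:00+05:30')
# (datetime.timedelta(seconds=19800), '2023-01-01T12:00:00')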
1698
1699
1700 def parse_iso8601(date_str, delimiter='T', timezone=None):
1701 """ Return a UNIX timestamp from the given date """
1702
1703 if date_str is None:
1704 return None
1705
1706 date_str = re.sub(r'\.[0-9]+', '', date_str)
1707
1708 if timezone is None:
1709 timezone, date_str = extract_timezone(date_str)
1710
1711 with contextlib.suppress(ValueError):
1712 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1713 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1714 return calendar.timegm(dt.timetuple())
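# Examples (values matching the project's test suite):
# >>> parse_iso8601('2014-03-23T23:04:26+0100')
# 1395612266
# >>> parse_iso8601('2014-03-23T22:04:26Z')
# 1395612266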
1715
1716
1717 def date_formats(day_first=True):
1718 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1719
1720
1721 def unified_strdate(date_str, day_first=True):
1722 """Return a string with the date in the format YYYYMMDD"""
1723
1724 if date_str is None:
1725 return None
1726 upload_date = None
1727 # Replace commas
1728 date_str = date_str.replace(',', ' ')
1729 # Remove AM/PM + timezone
1730 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1731 _, date_str = extract_timezone(date_str)
1732
1733 for expression in date_formats(day_first):
1734 with contextlib.suppress(ValueError):
1735 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1736 if upload_date is None:
1737 timetuple = email.utils.parsedate_tz(date_str)
1738 if timetuple:
1739 with contextlib.suppress(ValueError):
1740 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1741 if upload_date is not None:
1742 return str(upload_date)
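# Examples (as exercised by the test suite):
# >>> unified_strdate('December 21, 2010')
# '20101221'
# >>> unified_strdate('1968-12-10')
# '19681210'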
1743
1744
1745 def unified_timestamp(date_str, day_first=True):
1746 if date_str is None:
1747 return None
1748
1749 date_str = re.sub(r'[,|]', '', date_str)
1750
1751 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1752 timezone, date_str = extract_timezone(date_str)
1753
1754 # Remove AM/PM + timezone
1755 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1756
1757 # Remove unrecognized timezones from ISO 8601 alike timestamps
1758 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1759 if m:
1760 date_str = date_str[:-len(m.group('tz'))]
1761
1762 # Python only supports microseconds, so remove nanoseconds
1763 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1764 if m:
1765 date_str = m.group(1)
1766
1767 for expression in date_formats(day_first):
1768 with contextlib.suppress(ValueError):
1769 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1770 return calendar.timegm(dt.timetuple())
1771 timetuple = email.utils.parsedate_tz(date_str)
1772 if timetuple:
1773 return calendar.timegm(timetuple) + pm_delta * 3600
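# e.g. (hand-checked):
# >>> unified_timestamp('December 21, 2010')
# 1292889600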
1774
1775
1776 def determine_ext(url, default_ext='unknown_video'):
1777 if url is None or '.' not in url:
1778 return default_ext
1779 guess = url.partition('?')[0].rpartition('.')[2]
1780 if re.match(r'^[A-Za-z0-9]+$', guess):
1781 return guess
    # Try to extract the extension from URLs like http://example.com/foo/bar.mp4/?download
1783 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1784 return guess.rstrip('/')
1785 else:
1786 return default_ext
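# Examples (hand-checked):
# >>> determine_ext('http://example.com/video.mp4')
# 'mp4'
# >>> determine_ext('http://example.com/foo/bar.mp4/?download')
# 'mp4'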
1787
1788
1789 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1790 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1791
1792
1793 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1794 R"""
1795 Return a datetime object from a string.
1796 Supported format:
1797 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1798
1799 @param format strftime format of DATE
1800 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1801 auto: round to the unit provided in date_str (if applicable).
1802 """
1803 auto_precision = False
1804 if precision == 'auto':
1805 auto_precision = True
1806 precision = 'microsecond'
1807 today = datetime_round(datetime.datetime.utcnow(), precision)
1808 if date_str in ('now', 'today'):
1809 return today
1810 if date_str == 'yesterday':
1811 return today - datetime.timedelta(days=1)
1812 match = re.match(
1813 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1814 date_str)
1815 if match is not None:
1816 start_time = datetime_from_str(match.group('start'), precision, format)
1817 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1818 unit = match.group('unit')
1819 if unit == 'month' or unit == 'year':
1820 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1821 unit = 'day'
1822 else:
1823 if unit == 'week':
1824 unit = 'day'
1825 time *= 7
1826 delta = datetime.timedelta(**{unit + 's': time})
1827 new_date = start_time + delta
1828 if auto_precision:
1829 return datetime_round(new_date, unit)
1830 return new_date
1831
1832 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1833
1834
1835 def date_from_str(date_str, format='%Y%m%d', strict=False):
1836 R"""
1837 Return a date object from a string using datetime_from_str
1838
1839 @param strict Restrict allowed patterns to "YYYYMMDD" and
1840 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1841 """
1842 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1843 raise ValueError(f'Invalid date format "{date_str}"')
1844 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1845
1846
1847 def datetime_add_months(dt, months):
1848 """Increment/Decrement a datetime object by months."""
1849 month = dt.month + months - 1
1850 year = dt.year + month // 12
1851 month = month % 12 + 1
1852 day = min(dt.day, calendar.monthrange(year, month)[1])
1853 return dt.replace(year, month, day)
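# The day is clamped to the length of the target month, e.g.:
# >>> datetime_add_months(datetime.datetime(2020, 1, 31), 1)
# datetime.datetime(2020, 2, 29, 0, 0)
# >>> datetime_add_months(datetime.datetime(2020, 1, 31), -2)
# datetime.datetime(2019, 11, 30, 0, 0)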
1854
1855
1856 def datetime_round(dt, precision='day'):
1857 """
1858 Round a datetime object's time to a specific precision
1859 """
1860 if precision == 'microsecond':
1861 return dt
1862
1863 unit_seconds = {
1864 'day': 86400,
1865 'hour': 3600,
1866 'minute': 60,
1867 'second': 1,
1868 }
1869 roundto = lambda x, n: ((x + n / 2) // n) * n
1870 timestamp = calendar.timegm(dt.timetuple())
1871 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1872
1873
1874 def hyphenate_date(date_str):
1875 """
1876 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1877 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1878 if match is not None:
1879 return '-'.join(match.groups())
1880 else:
1881 return date_str
1882
1883
1884 class DateRange:
1885 """Represents a time interval between two dates"""
1886
1887 def __init__(self, start=None, end=None):
1888 """start and end must be strings in the format accepted by date"""
1889 if start is not None:
1890 self.start = date_from_str(start, strict=True)
1891 else:
1892 self.start = datetime.datetime.min.date()
1893 if end is not None:
1894 self.end = date_from_str(end, strict=True)
1895 else:
1896 self.end = datetime.datetime.max.date()
1897 if self.start > self.end:
            raise ValueError(f'Date range: "{self}": the start date must be before the end date')
1899
1900 @classmethod
1901 def day(cls, day):
1902 """Returns a range that only contains the given day"""
1903 return cls(day, day)
1904
1905 def __contains__(self, date):
1906 """Check if the date is in the range"""
1907 if not isinstance(date, datetime.date):
1908 date = date_from_str(date)
1909 return self.start <= date <= self.end
1910
1911 def __str__(self):
1912 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1913
1914 def __eq__(self, other):
1915 return (isinstance(other, DateRange)
1916 and self.start == other.start and self.end == other.end)
1917
1918
1919 def platform_name():
1920 """ Returns the platform name as a str """
1921 write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead')
1922 return platform.platform()
1923
1924
1925 @functools.cache
1926 def system_identifier():
1927 python_implementation = platform.python_implementation()
1928 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1929 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1930
1931 return 'Python %s (%s %s) - %s %s' % (
1932 platform.python_version(),
1933 python_implementation,
1934 platform.architecture()[0],
1935 platform.platform(),
1936 format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
1937 )
1938
1939
1940 @functools.cache
1941 def get_windows_version():
    ''' Get the Windows version as a tuple. Returns () if not running on Windows '''
1943 if compat_os_name == 'nt':
1944 return version_tuple(platform.win32_ver()[1])
1945 else:
1946 return ()
1947
1948
1949 def write_string(s, out=None, encoding=None):
1950 assert isinstance(s, str)
1951 out = out or sys.stderr
1952
1953 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1954 s = re.sub(r'([\r\n]+)', r' \1', s)
1955
1956 enc, buffer = None, out
1957 if 'b' in getattr(out, 'mode', ''):
1958 enc = encoding or preferredencoding()
1959 elif hasattr(out, 'buffer'):
1960 buffer = out.buffer
1961 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1962
1963 buffer.write(s.encode(enc, 'ignore') if enc else s)
1964 out.flush()
1965
1966
1967 def bytes_to_intlist(bs):
1968 if not bs:
1969 return []
    if isinstance(bs[0], int):  # bytes or bytearray
        return list(bs)
    else:  # str
1973 return [ord(c) for c in bs]
1974
1975
1976 def intlist_to_bytes(xs):
1977 if not xs:
1978 return b''
1979 return struct.pack('%dB' % len(xs), *xs)
1980
1981
1982 class LockingUnsupportedError(OSError):
1983 msg = 'File locking is not supported'
1984
1985 def __init__(self):
1986 super().__init__(self.msg)
1987
1988
1989 # Cross-platform file locking
1990 if sys.platform == 'win32':
1991 import ctypes
1992 import ctypes.wintypes
1993 import msvcrt
1994
1995 class OVERLAPPED(ctypes.Structure):
1996 _fields_ = [
1997 ('Internal', ctypes.wintypes.LPVOID),
1998 ('InternalHigh', ctypes.wintypes.LPVOID),
1999 ('Offset', ctypes.wintypes.DWORD),
2000 ('OffsetHigh', ctypes.wintypes.DWORD),
2001 ('hEvent', ctypes.wintypes.HANDLE),
2002 ]
2003
2004 kernel32 = ctypes.windll.kernel32
2005 LockFileEx = kernel32.LockFileEx
2006 LockFileEx.argtypes = [
2007 ctypes.wintypes.HANDLE, # hFile
2008 ctypes.wintypes.DWORD, # dwFlags
2009 ctypes.wintypes.DWORD, # dwReserved
2010 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2011 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2012 ctypes.POINTER(OVERLAPPED) # Overlapped
2013 ]
2014 LockFileEx.restype = ctypes.wintypes.BOOL
2015 UnlockFileEx = kernel32.UnlockFileEx
2016 UnlockFileEx.argtypes = [
2017 ctypes.wintypes.HANDLE, # hFile
2018 ctypes.wintypes.DWORD, # dwReserved
2019 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2020 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2021 ctypes.POINTER(OVERLAPPED) # Overlapped
2022 ]
2023 UnlockFileEx.restype = ctypes.wintypes.BOOL
2024 whole_low = 0xffffffff
2025 whole_high = 0x7fffffff
2026
2027 def _lock_file(f, exclusive, block):
2028 overlapped = OVERLAPPED()
2029 overlapped.Offset = 0
2030 overlapped.OffsetHigh = 0
2031 overlapped.hEvent = 0
2032 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2033
2034 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2035 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2036 0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2038 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2039
2040 def _unlock_file(f):
2041 assert f._lock_file_overlapped_p
2042 handle = msvcrt.get_osfhandle(f.fileno())
2043 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2044 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2045
2046 else:
2047 try:
2048 import fcntl
2049
2050 def _lock_file(f, exclusive, block):
2051 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2052 if not block:
2053 flags |= fcntl.LOCK_NB
2054 try:
2055 fcntl.flock(f, flags)
2056 except BlockingIOError:
2057 raise
2058 except OSError: # AOSP does not have flock()
2059 fcntl.lockf(f, flags)
2060
2061 def _unlock_file(f):
2062 try:
2063 fcntl.flock(f, fcntl.LOCK_UN)
2064 except OSError:
2065 fcntl.lockf(f, fcntl.LOCK_UN)
2066
2067 except ImportError:
2068
2069 def _lock_file(f, exclusive, block):
2070 raise LockingUnsupportedError()
2071
2072 def _unlock_file(f):
2073 raise LockingUnsupportedError()
2074
2075
2076 class locked_file:
2077 locked = False
2078
2079 def __init__(self, filename, mode, block=True, encoding=None):
2080 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2081 raise NotImplementedError(mode)
2082 self.mode, self.block = mode, block
2083
2084 writable = any(f in mode for f in 'wax+')
2085 readable = any(f in mode for f in 'r+')
2086 flags = functools.reduce(operator.ior, (
2087 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2088 getattr(os, 'O_BINARY', 0), # Windows only
2089 getattr(os, 'O_NOINHERIT', 0), # Windows only
2090 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2091 os.O_APPEND if 'a' in mode else 0,
2092 os.O_EXCL if 'x' in mode else 0,
2093 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2094 ))
2095
2096 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2097
2098 def __enter__(self):
2099 exclusive = 'r' not in self.mode
2100 try:
2101 _lock_file(self.f, exclusive, self.block)
2102 self.locked = True
2103 except OSError:
2104 self.f.close()
2105 raise
2106 if 'w' in self.mode:
2107 try:
2108 self.f.truncate()
2109 except OSError as e:
2110 if e.errno not in (
2111 errno.ESPIPE, # Illegal seek - expected for FIFO
2112 errno.EINVAL, # Invalid argument - expected for /dev/null
2113 ):
2114 raise
2115 return self
2116
2117 def unlock(self):
2118 if not self.locked:
2119 return
2120 try:
2121 _unlock_file(self.f)
2122 finally:
2123 self.locked = False
2124
2125 def __exit__(self, *_):
2126 try:
2127 self.unlock()
2128 finally:
2129 self.f.close()
2130
2131 open = __enter__
2132 close = __exit__
2133
2134 def __getattr__(self, attr):
2135 return getattr(self.f, attr)
2136
2137 def __iter__(self):
2138 return iter(self.f)
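# Usage sketch (hypothetical filename), e.g. to serialize writers across processes:
# with locked_file('download-archive.txt', 'a', block=True) as f:
#     f.write('youtube abc123\n')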
2139
2140
2141 @functools.cache
2142 def get_filesystem_encoding():
2143 encoding = sys.getfilesystemencoding()
2144 return encoding if encoding is not None else 'utf-8'
2145
2146
2147 def shell_quote(args):
2148 quoted_args = []
2149 encoding = get_filesystem_encoding()
2150 for a in args:
2151 if isinstance(a, bytes):
2152 # We may get a filename encoded with 'encodeFilename'
2153 a = a.decode(encoding)
2154 quoted_args.append(compat_shlex_quote(a))
2155 return ' '.join(quoted_args)
2156
2157
2158 def smuggle_url(url, data):
2159 """ Pass additional data in a URL for internal use. """
2160
2161 url, idata = unsmuggle_url(url, {})
2162 data.update(idata)
2163 sdata = urllib.parse.urlencode(
2164 {'__youtubedl_smuggle': json.dumps(data)})
2165 return url + '#' + sdata
2166
2167
2168 def unsmuggle_url(smug_url, default=None):
2169 if '#__youtubedl_smuggle' not in smug_url:
2170 return smug_url, default
2171 url, _, sdata = smug_url.rpartition('#')
2172 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2173 data = json.loads(jsond)
2174 return url, data
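# Round-trip example (hypothetical URL and payload):
# >>> unsmuggle_url(smuggle_url('http://example.com/v', {'referer': 'x'}))
# ('http://example.com/v', {'referer': 'x'})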
2175
2176
2177 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2178 """ Formats numbers with decimal sufixes like K, M, etc """
2179 num, factor = float_or_none(num), float(factor)
2180 if num is None or num < 0:
2181 return None
2182 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2183 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2184 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2185 if factor == 1024:
2186 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2187 converted = num / (factor ** exponent)
2188 return fmt % (converted, suffix)
2189
2190
2191 def format_bytes(bytes):
2192 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2193
2194
2195 def lookup_unit_table(unit_table, s):
2196 units_re = '|'.join(re.escape(u) for u in unit_table)
2197 m = re.match(
2198 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2199 if not m:
2200 return None
2201 num_str = m.group('num').replace(',', '.')
2202 mult = unit_table[m.group('unit')]
2203 return int(float(num_str) * mult)
2204
2205
2206 def parse_filesize(s):
2207 if s is None:
2208 return None
2209
2210 # The lower-case forms are of course incorrect and unofficial,
2211 # but we support those too
2212 _UNIT_TABLE = {
2213 'B': 1,
2214 'b': 1,
2215 'bytes': 1,
2216 'KiB': 1024,
2217 'KB': 1000,
2218 'kB': 1024,
2219 'Kb': 1000,
2220 'kb': 1000,
2221 'kilobytes': 1000,
2222 'kibibytes': 1024,
2223 'MiB': 1024 ** 2,
2224 'MB': 1000 ** 2,
2225 'mB': 1024 ** 2,
2226 'Mb': 1000 ** 2,
2227 'mb': 1000 ** 2,
2228 'megabytes': 1000 ** 2,
2229 'mebibytes': 1024 ** 2,
2230 'GiB': 1024 ** 3,
2231 'GB': 1000 ** 3,
2232 'gB': 1024 ** 3,
2233 'Gb': 1000 ** 3,
2234 'gb': 1000 ** 3,
2235 'gigabytes': 1000 ** 3,
2236 'gibibytes': 1024 ** 3,
2237 'TiB': 1024 ** 4,
2238 'TB': 1000 ** 4,
2239 'tB': 1024 ** 4,
2240 'Tb': 1000 ** 4,
2241 'tb': 1000 ** 4,
2242 'terabytes': 1000 ** 4,
2243 'tebibytes': 1024 ** 4,
2244 'PiB': 1024 ** 5,
2245 'PB': 1000 ** 5,
2246 'pB': 1024 ** 5,
2247 'Pb': 1000 ** 5,
2248 'pb': 1000 ** 5,
2249 'petabytes': 1000 ** 5,
2250 'pebibytes': 1024 ** 5,
2251 'EiB': 1024 ** 6,
2252 'EB': 1000 ** 6,
2253 'eB': 1024 ** 6,
2254 'Eb': 1000 ** 6,
2255 'eb': 1000 ** 6,
2256 'exabytes': 1000 ** 6,
2257 'exbibytes': 1024 ** 6,
2258 'ZiB': 1024 ** 7,
2259 'ZB': 1000 ** 7,
2260 'zB': 1024 ** 7,
2261 'Zb': 1000 ** 7,
2262 'zb': 1000 ** 7,
2263 'zettabytes': 1000 ** 7,
2264 'zebibytes': 1024 ** 7,
2265 'YiB': 1024 ** 8,
2266 'YB': 1000 ** 8,
2267 'yB': 1024 ** 8,
2268 'Yb': 1000 ** 8,
2269 'yb': 1000 ** 8,
2270 'yottabytes': 1000 ** 8,
2271 'yobibytes': 1024 ** 8,
2272 }
2273
2274 return lookup_unit_table(_UNIT_TABLE, s)
2275
2276
2277 def parse_count(s):
2278 if s is None:
2279 return None
2280
2281 s = re.sub(r'^[^\d]+\s', '', s).strip()
2282
2283 if re.match(r'^[\d,.]+$', s):
2284 return str_to_int(s)
2285
2286 _UNIT_TABLE = {
2287 'k': 1000,
2288 'K': 1000,
2289 'm': 1000 ** 2,
2290 'M': 1000 ** 2,
2291 'kk': 1000 ** 2,
2292 'KK': 1000 ** 2,
2293 'b': 1000 ** 3,
2294 'B': 1000 ** 3,
2295 }
2296
2297 ret = lookup_unit_table(_UNIT_TABLE, s)
2298 if ret is not None:
2299 return ret
2300
2301 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2302 if mobj:
2303 return str_to_int(mobj.group(1))
2304
2305
2306 def parse_resolution(s, *, lenient=False):
2307 if s is None:
2308 return {}
2309
2310 if lenient:
2311 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2312 else:
2313 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2314 if mobj:
2315 return {
2316 'width': int(mobj.group('w')),
2317 'height': int(mobj.group('h')),
2318 }
2319
2320 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2321 if mobj:
2322 return {'height': int(mobj.group(1))}
2323
2324 mobj = re.search(r'\b([48])[kK]\b', s)
2325 if mobj:
2326 return {'height': int(mobj.group(1)) * 540}
2327
2328 return {}
2329
2330
2331 def parse_bitrate(s):
2332 if not isinstance(s, str):
2333 return
2334 mobj = re.search(r'\b(\d+)\s*kbps', s)
2335 if mobj:
2336 return int(mobj.group(1))
2337
2338
2339 def month_by_name(name, lang='en'):
2340 """ Return the number of a month by (locale-independently) English name """
2341
2342 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2343
2344 try:
2345 return month_names.index(name) + 1
2346 except ValueError:
2347 return None
2348
2349
2350 def month_by_abbreviation(abbrev):
2351 """ Return the number of a month by (locale-independently) English
2352 abbreviations """
2353
2354 try:
2355 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2356 except ValueError:
2357 return None
2358
2359
2360 def fix_xml_ampersands(xml_str):
2361 """Replace all the '&' by '&amp;' in XML"""
2362 return re.sub(
2363 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2364 '&amp;',
2365 xml_str)
2366
2367
2368 def setproctitle(title):
2369 assert isinstance(title, str)
2370
2371 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2372 try:
2373 import ctypes
2374 except ImportError:
2375 return
2376
2377 try:
2378 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2379 except OSError:
2380 return
2381 except TypeError:
        # Historical: LoadLibrary on Windows Python 2.7.13 only accepted a
        # bytestring, so unicode_literals made calls with str fail; kept
        # for safety, though it should not trigger on Python 3
2385 return
2386 title_bytes = title.encode()
2387 buf = ctypes.create_string_buffer(len(title_bytes))
2388 buf.value = title_bytes
2389 try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
2391 except AttributeError:
2392 return # Strange libc, just skip this
2393
2394
2395 def remove_start(s, start):
2396 return s[len(start):] if s is not None and s.startswith(start) else s
2397
2398
2399 def remove_end(s, end):
2400 return s[:-len(end)] if s is not None and s.endswith(end) else s
2401
2402
2403 def remove_quotes(s):
2404 if s is None or len(s) < 2:
2405 return s
2406 for quote in ('"', "'", ):
2407 if s[0] == quote and s[-1] == quote:
2408 return s[1:-1]
2409 return s
2410
2411
2412 def get_domain(url):
2413 """
2414 This implementation is inconsistent, but is kept for compatibility.
2415 Use this only for "webpage_url_domain"
2416 """
2417 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2418
2419
2420 def url_basename(url):
2421 path = urllib.parse.urlparse(url).path
2422 return path.strip('/').split('/')[-1]
2423
2424
2425 def base_url(url):
2426 return re.match(r'https?://[^?#&]+/', url).group()
2427
2428
2429 def urljoin(base, path):
2430 if isinstance(path, bytes):
2431 path = path.decode()
2432 if not isinstance(path, str) or not path:
2433 return None
2434 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2435 return path
2436 if isinstance(base, bytes):
2437 base = base.decode()
2438 if not isinstance(base, str) or not re.match(
2439 r'^(?:https?:)?//', base):
2440 return None
2441 return urllib.parse.urljoin(base, path)
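# Examples (hand-checked):
# >>> urljoin('https://example.com/a/b', 'c/d')
# 'https://example.com/a/c/d'
# >>> urljoin('https://example.com/a/', '//cdn.example.com/x')
# '//cdn.example.com/x'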
2442
2443
2444 class HEADRequest(urllib.request.Request):
2445 def get_method(self):
2446 return 'HEAD'
2447
2448
2449 class PUTRequest(urllib.request.Request):
2450 def get_method(self):
2451 return 'PUT'
2452
2453
2454 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2455 if get_attr and v is not None:
2456 v = getattr(v, get_attr, None)
2457 try:
2458 return int(v) * invscale // scale
2459 except (ValueError, TypeError, OverflowError):
2460 return default
2461
2462
2463 def str_or_none(v, default=None):
2464 return default if v is None else str(v)
2465
2466
2467 def str_to_int(int_str):
2468 """ A more relaxed version of int_or_none """
2469 if isinstance(int_str, int):
2470 return int_str
2471 elif isinstance(int_str, str):
2472 int_str = re.sub(r'[,\.\+]', '', int_str)
2473 return int_or_none(int_str)
2474
2475
2476 def float_or_none(v, scale=1, invscale=1, default=None):
2477 if v is None:
2478 return default
2479 try:
2480 return float(v) * invscale / scale
2481 except (ValueError, TypeError):
2482 return default
2483
2484
2485 def bool_or_none(v, default=None):
2486 return v if isinstance(v, bool) else default
2487
2488
2489 def strip_or_none(v, default=None):
2490 return v.strip() if isinstance(v, str) else default
2491
2492
2493 def url_or_none(url):
2494 if not url or not isinstance(url, str):
2495 return None
2496 url = url.strip()
2497 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2498
2499
2500 def request_to_url(req):
2501 if isinstance(req, urllib.request.Request):
2502 return req.get_full_url()
2503 else:
2504 return req
2505
2506
2507 def strftime_or_none(timestamp, date_format, default=None):
2508 datetime_object = None
2509 try:
2510 if isinstance(timestamp, (int, float)): # unix timestamp
2511 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2512 elif isinstance(timestamp, str): # assume YYYYMMDD
2513 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2514 return datetime_object.strftime(date_format)
2515 except (ValueError, TypeError, AttributeError):
2516 return default
2517
2518
2519 def parse_duration(s):
2520 if not isinstance(s, str):
2521 return None
2522 s = s.strip()
2523 if not s:
2524 return None
2525
2526 days, hours, mins, secs, ms = [None] * 5
2527 m = re.match(r'''(?x)
2528 (?P<before_secs>
2529 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2530 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2531 (?P<ms>[.:][0-9]+)?Z?$
2532 ''', s)
2533 if m:
2534 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2535 else:
2536 m = re.match(
2537 r'''(?ix)(?:P?
2538 (?:
2539 [0-9]+\s*y(?:ears?)?,?\s*
2540 )?
2541 (?:
2542 [0-9]+\s*m(?:onths?)?,?\s*
2543 )?
2544 (?:
2545 [0-9]+\s*w(?:eeks?)?,?\s*
2546 )?
2547 (?:
2548 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2549 )?
2550 T)?
2551 (?:
2552 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2553 )?
2554 (?:
2555 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2556 )?
2557 (?:
2558 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2559 )?Z?$''', s)
2560 if m:
2561 days, hours, mins, secs, ms = m.groups()
2562 else:
2563 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2564 if m:
2565 hours, mins = m.groups()
2566 else:
2567 return None
2568
2569 if ms:
2570 ms = ms.replace(':', '.')
2571 return sum(float(part or 0) * mult for part, mult in (
2572 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
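# Examples (hand-checked against the regexes above):
# >>> parse_duration('9:12:43')
# 33163.0
# >>> parse_duration('PT1H30M')
# 5400.0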
2573
2574
2575 def prepend_extension(filename, ext, expected_real_ext=None):
2576 name, real_ext = os.path.splitext(filename)
2577 return (
2578 f'{name}.{ext}{real_ext}'
2579 if not expected_real_ext or real_ext[1:] == expected_real_ext
2580 else f'{filename}.{ext}')
2581
2582
2583 def replace_extension(filename, ext, expected_real_ext=None):
2584 name, real_ext = os.path.splitext(filename)
2585 return '{}.{}'.format(
2586 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2587 ext)
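# Examples for both helpers (hand-checked):
# >>> prepend_extension('video.mp4', 'temp')
# 'video.temp.mp4'
# >>> replace_extension('video.webm', 'mp4')
# 'video.mp4'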
2588
2589
2590 def check_executable(exe, args=[]):
2591 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2592 args can be a list of arguments for a short output (like -version) """
2593 try:
2594 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2595 except OSError:
2596 return False
2597 return exe
2598
2599
2600 def _get_exe_version_output(exe, args, *, to_screen=None):
2601 if to_screen:
2602 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2603 try:
2604 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2605 # SIGTTOU if yt-dlp is run in the background.
2606 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2607 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2608 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2609 except OSError:
2610 return False
2611 return stdout
2612
2613
2614 def detect_exe_version(output, version_re=None, unrecognized='present'):
2615 assert isinstance(output, str)
2616 if version_re is None:
2617 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2618 m = re.search(version_re, output)
2619 if m:
2620 return m.group(1)
2621 else:
2622 return unrecognized
2623
2624
2625 def get_exe_version(exe, args=['--version'],
2626 version_re=None, unrecognized='present'):
2627 """ Returns the version of the specified executable,
2628 or False if the executable is not present """
2629 out = _get_exe_version_output(exe, args)
2630 return detect_exe_version(out, version_re, unrecognized) if out else False
2631
2632
2633 def frange(start=0, stop=None, step=1):
2634 """Float range"""
2635 if stop is None:
2636 start, stop = 0, start
2637 sign = [-1, 1][step > 0] if step else 0
2638 while sign * start < sign * stop:
2639 yield start
2640 start += step
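# e.g.:
# >>> list(frange(5))
# [0, 1, 2, 3, 4]
# >>> list(frange(0, 1, 0.25))
# [0, 0.25, 0.5, 0.75]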
2641
2642
2643 class LazyList(collections.abc.Sequence):
2644 """Lazy immutable list from an iterable
2645 Note that slices of a LazyList are lists and not LazyList"""
2646
2647 class IndexError(IndexError):
2648 pass
2649
2650 def __init__(self, iterable, *, reverse=False, _cache=None):
2651 self._iterable = iter(iterable)
2652 self._cache = [] if _cache is None else _cache
2653 self._reversed = reverse
2654
2655 def __iter__(self):
2656 if self._reversed:
2657 # We need to consume the entire iterable to iterate in reverse
2658 yield from self.exhaust()
2659 return
2660 yield from self._cache
2661 for item in self._iterable:
2662 self._cache.append(item)
2663 yield item
2664
2665 def _exhaust(self):
2666 self._cache.extend(self._iterable)
2667 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2668 return self._cache
2669
2670 def exhaust(self):
2671 """Evaluate the entire iterable"""
2672 return self._exhaust()[::-1 if self._reversed else 1]
2673
2674 @staticmethod
2675 def _reverse_index(x):
2676 return None if x is None else ~x
2677
2678 def __getitem__(self, idx):
2679 if isinstance(idx, slice):
2680 if self._reversed:
2681 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2682 start, stop, step = idx.start, idx.stop, idx.step or 1
2683 elif isinstance(idx, int):
2684 if self._reversed:
2685 idx = self._reverse_index(idx)
2686 start, stop, step = idx, idx, 0
2687 else:
2688 raise TypeError('indices must be integers or slices')
2689 if ((start or 0) < 0 or (stop or 0) < 0
2690 or (start is None and step < 0)
2691 or (stop is None and step > 0)):
2692 # We need to consume the entire iterable to be able to slice from the end
2693 # Obviously, never use this with infinite iterables
2694 self._exhaust()
2695 try:
2696 return self._cache[idx]
2697 except IndexError as e:
2698 raise self.IndexError(e) from e
2699 n = max(start or 0, stop or 0) - len(self._cache) + 1
2700 if n > 0:
2701 self._cache.extend(itertools.islice(self._iterable, n))
2702 try:
2703 return self._cache[idx]
2704 except IndexError as e:
2705 raise self.IndexError(e) from e
2706
2707 def __bool__(self):
2708 try:
2709 self[-1] if self._reversed else self[0]
2710 except self.IndexError:
2711 return False
2712 return True
2713
2714 def __len__(self):
2715 self._exhaust()
2716 return len(self._cache)
2717
2718 def __reversed__(self):
2719 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2720
2721 def __copy__(self):
2722 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2723
2724 def __repr__(self):
2725 # repr and str should mimic a list. So we exhaust the iterable
2726 return repr(self.exhaust())
2727
2728 def __str__(self):
2729 return repr(self.exhaust())
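# LazyList consumes the underlying iterable only as far as needed, e.g.:
# >>> l = LazyList(itertools.count())
# >>> l[4]  # pulls just the first five items into the cache
# 4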
2730
2731
2732 class PagedList:
2733
2734 class IndexError(IndexError):
2735 pass
2736
2737 def __len__(self):
2738 # This is only useful for tests
2739 return len(self.getslice())
2740
2741 def __init__(self, pagefunc, pagesize, use_cache=True):
2742 self._pagefunc = pagefunc
2743 self._pagesize = pagesize
2744 self._pagecount = float('inf')
2745 self._use_cache = use_cache
2746 self._cache = {}
2747
2748 def getpage(self, pagenum):
2749 page_results = self._cache.get(pagenum)
2750 if page_results is None:
2751 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2752 if self._use_cache:
2753 self._cache[pagenum] = page_results
2754 return page_results
2755
2756 def getslice(self, start=0, end=None):
2757 return list(self._getslice(start, end))
2758
2759 def _getslice(self, start, end):
2760 raise NotImplementedError('This method must be implemented by subclasses')
2761
2762 def __getitem__(self, idx):
2763 assert self._use_cache, 'Indexing PagedList requires cache'
2764 if not isinstance(idx, int) or idx < 0:
2765 raise TypeError('indices must be non-negative integers')
2766 entries = self.getslice(idx, idx + 1)
2767 if not entries:
2768 raise self.IndexError()
2769 return entries[0]
2770
2771
2772 class OnDemandPagedList(PagedList):
2773 """Download pages until a page with less than maximum results"""
2774
2775 def _getslice(self, start, end):
2776 for pagenum in itertools.count(start // self._pagesize):
2777 firstid = pagenum * self._pagesize
2778 nextfirstid = pagenum * self._pagesize + self._pagesize
2779 if start >= nextfirstid:
2780 continue
2781
2782 startv = (
2783 start % self._pagesize
2784 if firstid <= start < nextfirstid
2785 else 0)
2786 endv = (
2787 ((end - 1) % self._pagesize) + 1
2788 if (end is not None and firstid <= end <= nextfirstid)
2789 else None)
2790
2791 try:
2792 page_results = self.getpage(pagenum)
2793 except Exception:
2794 self._pagecount = pagenum - 1
2795 raise
2796 if startv != 0 or endv is not None:
2797 page_results = page_results[startv:endv]
2798 yield from page_results
2799
            # A little optimization: if the current page is not "full", i.e. it
            # contains fewer than page_size videos, we can assume it is the last
            # one, so there is no need to query further pages.
2804 if len(page_results) + startv < self._pagesize:
2805 break
2806
2807 # If we got the whole page, but the next page is not interesting,
2808 # break out early as well
2809 if end == nextfirstid:
2810 break
2811
2812
2813 class InAdvancePagedList(PagedList):
2814 """PagedList with total number of pages known in advance"""
2815
2816 def __init__(self, pagefunc, pagecount, pagesize):
2817 PagedList.__init__(self, pagefunc, pagesize, True)
2818 self._pagecount = pagecount
2819
2820 def _getslice(self, start, end):
2821 start_page = start // self._pagesize
2822 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2823 skip_elems = start - start_page * self._pagesize
2824 only_more = None if end is None else end - start
2825 for pagenum in range(start_page, end_page):
2826 page_results = self.getpage(pagenum)
2827 if skip_elems:
2828 page_results = page_results[skip_elems:]
2829 skip_elems = None
2830 if only_more is not None:
2831 if len(page_results) < only_more:
2832 only_more -= len(page_results)
2833 else:
2834 yield from page_results[:only_more]
2835 break
2836 yield from page_results
2837
2838
2839 class PlaylistEntries:
2840 MissingEntry = object()
2841 is_exhausted = False
2842
2843 def __init__(self, ydl, info_dict):
2844 self.ydl = ydl
2845
2846 # _entries must be assigned now since infodict can change during iteration
2847 entries = info_dict.get('entries')
2848 if entries is None:
2849 raise EntryNotInPlaylist('There are no entries')
2850 elif isinstance(entries, list):
2851 self.is_exhausted = True
2852
2853 requested_entries = info_dict.get('requested_entries')
2854 self.is_incomplete = bool(requested_entries)
2855 if self.is_incomplete:
2856 assert self.is_exhausted
2857 self._entries = [self.MissingEntry] * max(requested_entries)
2858 for i, entry in zip(requested_entries, entries):
2859 self._entries[i - 1] = entry
2860 elif isinstance(entries, (list, PagedList, LazyList)):
2861 self._entries = entries
2862 else:
2863 self._entries = LazyList(entries)
2864
2865 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2866 (?P<start>[+-]?\d+)?
2867 (?P<range>[:-]
2868 (?P<end>[+-]?\d+|inf(?:inite)?)?
2869 (?::(?P<step>[+-]?\d+))?
2870 )?''')
2871
2872 @classmethod
2873 def parse_playlist_items(cls, string):
2874 for segment in string.split(','):
2875 if not segment:
                raise ValueError('There are two or more consecutive commas')
2877 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2878 if not mobj:
2879 raise ValueError(f'{segment!r} is not a valid specification')
2880 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2881 if int_or_none(step) == 0:
2882 raise ValueError(f'Step in {segment!r} cannot be zero')
2883 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2884
2885 def get_requested_items(self):
2886 playlist_items = self.ydl.params.get('playlist_items')
2887 playlist_start = self.ydl.params.get('playliststart', 1)
2888 playlist_end = self.ydl.params.get('playlistend')
2889 # For backwards compatibility, interpret -1 as whole list
2890 if playlist_end in (-1, None):
2891 playlist_end = ''
2892 if not playlist_items:
2893 playlist_items = f'{playlist_start}:{playlist_end}'
2894 elif playlist_start != 1 or playlist_end:
2895 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2896
2897 for index in self.parse_playlist_items(playlist_items):
2898 for i, entry in self[index]:
2899 yield i, entry
2900 if not entry:
2901 continue
2902 try:
2903 # TODO: Add auto-generated fields
2904 self.ydl._match_entry(entry, incomplete=True, silent=True)
2905 except (ExistingVideoReached, RejectedVideoReached):
2906 return
2907
2908 def get_full_count(self):
2909 if self.is_exhausted and not self.is_incomplete:
2910 return len(self)
2911 elif isinstance(self._entries, InAdvancePagedList):
2912 if self._entries._pagesize == 1:
2913 return self._entries._pagecount
2914
2915 @functools.cached_property
2916 def _getter(self):
2917 if isinstance(self._entries, list):
2918 def get_entry(i):
2919 try:
2920 entry = self._entries[i]
2921 except IndexError:
2922 entry = self.MissingEntry
2923 if not self.is_incomplete:
2924 raise self.IndexError()
2925 if entry is self.MissingEntry:
2926 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2927 return entry
2928 else:
2929 def get_entry(i):
2930 try:
2931 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2932 except (LazyList.IndexError, PagedList.IndexError):
2933 raise self.IndexError()
2934 return get_entry
2935
2936 def __getitem__(self, idx):
2937 if isinstance(idx, int):
2938 idx = slice(idx, idx)
2939
2940 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2941 step = 1 if idx.step is None else idx.step
2942 if idx.start is None:
2943 start = 0 if step > 0 else len(self) - 1
2944 else:
2945 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2946
2947 # NB: Do not call len(self) when idx == [:]
2948 if idx.stop is None:
2949 stop = 0 if step < 0 else float('inf')
2950 else:
2951 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2952 stop += [-1, 1][step > 0]
2953
2954 for i in frange(start, stop, step):
2955 if i < 0:
2956 continue
2957 try:
2958 entry = self._getter(i)
2959 except self.IndexError:
2960 self.is_exhausted = True
2961 if step > 0:
2962 break
2963 continue
2964 yield i + 1, entry
2965
2966 def __len__(self):
2967 return len(tuple(self[:]))
2968
2969 class IndexError(IndexError):
2970 pass
2971
2972
2973 def uppercase_escape(s):
2974 unicode_escape = codecs.getdecoder('unicode_escape')
2975 return re.sub(
2976 r'\\U[0-9a-fA-F]{8}',
2977 lambda m: unicode_escape(m.group(0))[0],
2978 s)
2979
2980
2981 def lowercase_escape(s):
2982 unicode_escape = codecs.getdecoder('unicode_escape')
2983 return re.sub(
2984 r'\\u[0-9a-fA-F]{4}',
2985 lambda m: unicode_escape(m.group(0))[0],
2986 s)
2987
2988
2989 def escape_rfc3986(s):
2990 """Escape non-ASCII characters as suggested by RFC 3986"""
2991 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2992
2993
2994 def escape_url(url):
2995 """Escape URL as suggested by RFC 3986"""
2996 url_parsed = urllib.parse.urlparse(url)
2997 return url_parsed._replace(
2998 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2999 path=escape_rfc3986(url_parsed.path),
3000 params=escape_rfc3986(url_parsed.params),
3001 query=escape_rfc3986(url_parsed.query),
3002 fragment=escape_rfc3986(url_parsed.fragment)
3003 ).geturl()
3004
3005
3006 def parse_qs(url):
3007 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
3008
3009
3010 def read_batch_urls(batch_fd):
3011 def fixup(url):
3012 if not isinstance(url, str):
3013 url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')  # mojibake and properly-decoded forms of the UTF-8 BOM
3015 for bom in BOM_UTF8:
3016 if url.startswith(bom):
3017 url = url[len(bom):]
3018 url = url.lstrip()
3019 if not url or url.startswith(('#', ';', ']')):
3020 return False
3021 # "#" cannot be stripped out since it is part of the URI
3022 # However, it can be safely stripped out if following a whitespace
3023 return re.split(r'\s#', url, 1)[0].rstrip()
3024
3025 with contextlib.closing(batch_fd) as fd:
3026 return [url for url in map(fixup, fd) if url]
3027
3028
3029 def urlencode_postdata(*args, **kargs):
3030 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3031
3032
3033 def update_url_query(url, query):
3034 if not query:
3035 return url
3036 parsed_url = urllib.parse.urlparse(url)
3037 qs = urllib.parse.parse_qs(parsed_url.query)
3038 qs.update(query)
3039 return urllib.parse.urlunparse(parsed_url._replace(
3040 query=urllib.parse.urlencode(qs, True)))
3041
3042
3043 def update_Request(req, url=None, data=None, headers=None, query=None):
3044 req_headers = req.headers.copy()
3045 req_headers.update(headers or {})
3046 req_data = data or req.data
3047 req_url = update_url_query(url or req.get_full_url(), query)
3048 req_get_method = req.get_method()
3049 if req_get_method == 'HEAD':
3050 req_type = HEADRequest
3051 elif req_get_method == 'PUT':
3052 req_type = PUTRequest
3053 else:
3054 req_type = urllib.request.Request
3055 new_req = req_type(
3056 req_url, data=req_data, headers=req_headers,
3057 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3058 if hasattr(req, 'timeout'):
3059 new_req.timeout = req.timeout
3060 return new_req
3061
3062
3063 def _multipart_encode_impl(data, boundary):
3064 content_type = 'multipart/form-data; boundary=%s' % boundary
3065
3066 out = b''
3067 for k, v in data.items():
3068 out += b'--' + boundary.encode('ascii') + b'\r\n'
3069 if isinstance(k, str):
3070 k = k.encode()
3071 if isinstance(v, str):
3072 v = v.encode()
3073 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3074 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3075 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3076 if boundary.encode('ascii') in content:
3077 raise ValueError('Boundary overlaps with data')
3078 out += content
3079
3080 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3081
3082 return out, content_type
3083
3084
3085 def multipart_encode(data, boundary=None):
3086 '''
3087 Encode a dict to RFC 7578-compliant form-data
3088
3089 data:
3090 A dict where keys and values can be either Unicode or bytes-like
3091 objects.
3092 boundary:
        If specified, it must be a Unicode object and is used as the
        boundary. Otherwise a random boundary is generated.
3095
3096 Reference: https://tools.ietf.org/html/rfc7578
3097 '''
3098 has_specified_boundary = boundary is not None
3099
3100 while True:
3101 if boundary is None:
3102 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3103
3104 try:
3105 out, content_type = _multipart_encode_impl(data, boundary)
3106 break
3107 except ValueError:
3108 if has_specified_boundary:
3109 raise
3110 boundary = None
3111
3112 return out, content_type
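# e.g. with a fixed boundary for reproducibility (hand-checked):
# >>> multipart_encode({b'field': b'value'}, boundary='X')
# (b'--X\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--X--\r\n',
#  'multipart/form-data; boundary=X')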
3113
3114
3115 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3116 for val in map(d.get, variadic(key_or_keys)):
3117 if val is not None and (val or not skip_false_values):
3118 return val
3119 return default
3120
3121
3122 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3123 for f in funcs:
3124 try:
3125 val = f(*args, **kwargs)
3126 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3127 pass
3128 else:
3129 if expected_type is None or isinstance(val, expected_type):
3130 return val
3131
3132
3133 def try_get(src, getter, expected_type=None):
3134 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
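# e.g. failed lookups are swallowed and yield None:
# >>> try_get({'a': {'b': 42}}, lambda x: x['a']['b'], int)
# 42
# >>> try_get({}, lambda x: x['missing']) is None
# True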
3135
3136
3137 def filter_dict(dct, cndn=lambda _, v: v is not None):
3138 return {k: v for k, v in dct.items() if cndn(k, v)}
3139
3140
3141 def merge_dicts(*dicts):
3142 merged = {}
3143 for a_dict in dicts:
3144 for k, v in a_dict.items():
            if ((v is not None and k not in merged)
                    or (isinstance(v, str) and merged[k] == '')):
3147 merged[k] = v
3148 return merged
3149
3150
3151 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3152 return string if isinstance(string, str) else str(string, encoding, errors)
3153
3154
3155 US_RATINGS = {
3156 'G': 0,
3157 'PG': 10,
3158 'PG-13': 13,
3159 'R': 16,
3160 'NC': 18,
3161 }
3162
3163
3164 TV_PARENTAL_GUIDELINES = {
3165 'TV-Y': 0,
3166 'TV-Y7': 7,
3167 'TV-G': 0,
3168 'TV-PG': 0,
3169 'TV-14': 14,
3170 'TV-MA': 17,
3171 }
3172
3173
3174 def parse_age_limit(s):
3175 # isinstance(False, int) is True. So type() must be used instead
3176 if type(s) is int: # noqa: E721
3177 return s if 0 <= s <= 21 else None
3178 elif not isinstance(s, str):
3179 return None
3180 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3181 if m:
3182 return int(m.group('age'))
3183 s = s.upper()
3184 if s in US_RATINGS:
3185 return US_RATINGS[s]
3186 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3187 if m:
3188 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3189 return None
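# Examples (hand-checked against the tables above):
# >>> parse_age_limit('18+')
# 18
# >>> parse_age_limit('PG-13')
# 13
# >>> parse_age_limit('TV-MA')
# 17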
3190
3191
3192 def strip_jsonp(code):
3193 return re.sub(
3194 r'''(?sx)^
3195 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3196 (?:\s*&&\s*(?P=func_name))?
3197 \s*\(\s*(?P<callback_data>.*)\);?
3198 \s*?(?://[^\n]*)*$''',
3199 r'\g<callback_data>', code)
3200
3201
3202 def js_to_json(code, vars={}):
3203 # vars is a dict of var, val pairs to substitute
3204 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3205 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3206 INTEGER_TABLE = (
3207 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3208 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3209 )
3210
3211 def fix_kv(m):
3212 v = m.group(0)
3213 if v in ('true', 'false', 'null'):
3214 return v
3215 elif v in ('undefined', 'void 0'):
3216 return 'null'
3217 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3218 return ""
3219
3220 if v[0] in ("'", '"'):
3221 v = re.sub(r'(?s)\\.|"', lambda m: {
3222 '"': '\\"',
3223 "\\'": "'",
3224 '\\\n': '',
3225 '\\x': '\\u00',
3226 }.get(m.group(0), m.group(0)), v[1:-1])
3227 else:
3228 for regex, base in INTEGER_TABLE:
3229 im = re.match(regex, v)
3230 if im:
3231 i = int(im.group(1), base)
3232 return '"%d":' % i if v.endswith(':') else '%d' % i
3233
3234 if v in vars:
3235 return vars[v]
3236
3237 return '"%s"' % v
3238
3239 def create_map(mobj):
3240 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3241
3242 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3243 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3244
3245 return re.sub(r'''(?sx)
3246 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3247 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3248 {comment}|,(?={skip}[\]}}])|
3249 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3250 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3251 [0-9]+(?={skip}:)|
3252 !+
3253 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
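# e.g. bare keys get quoted and JS-only literals are normalized (hand-checked):
# >>> js_to_json("{a: 1, b: 'c', d: undefined}")
# '{"a": 1, "b": "c", "d": null}'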
3254
3255
3256 def qualities(quality_ids):
3257 """ Get a numeric quality value out of a list of possible values """
3258 def q(qid):
3259 try:
3260 return quality_ids.index(qid)
3261 except ValueError:
3262 return -1
3263 return q
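# e.g.:
# >>> q = qualities(['144p', '480p', '1080p'])
# >>> q('480p'), q('unknown')
# (1, -1)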
3264
3265
3266 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3267
3268
3269 DEFAULT_OUTTMPL = {
3270 'default': '%(title)s [%(id)s].%(ext)s',
3271 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3272 }
3273 OUTTMPL_TYPES = {
3274 'chapter': None,
3275 'subtitle': None,
3276 'thumbnail': None,
3277 'description': 'description',
3278 'annotation': 'annotations.xml',
3279 'infojson': 'info.json',
3280 'link': None,
3281 'pl_video': None,
3282 'pl_thumbnail': None,
3283 'pl_description': 'description',
3284 'pl_infojson': 'info.json',
3285 }
3286
# Per [1], the %-format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting
3290 STR_FORMAT_RE_TMPL = r'''(?x)
3291 (?<!%)(?P<prefix>(?:%%)*)
3292 %
3293 (?P<has_key>\((?P<key>{0})\))?
3294 (?P<format>
3295 (?P<conversion>[#0\-+ ]+)?
3296 (?P<min_width>\d+)?
3297 (?P<precision>\.\d+)?
3298 (?P<len_mod>[hlL])? # unused in python
3299 {1} # conversion type
3300 )
3301 '''
3302
3303
3304 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3305
3306
3307 def limit_length(s, length):
3308 """ Add ellipses to overly long strings """
3309 if s is None:
3310 return None
3311 ELLIPSES = '...'
3312 if len(s) > length:
3313 return s[:length - len(ELLIPSES)] + ELLIPSES
3314 return s
3315
3316
3317 def version_tuple(v):
3318 return tuple(int(e) for e in re.split(r'[-.]', v))
3319
3320
3321 def is_outdated_version(version, limit, assume_new=True):
3322 if not version:
3323 return not assume_new
3324 try:
3325 return version_tuple(version) < version_tuple(limit)
3326 except ValueError:
3327 return not assume_new
3328
3329
3330 def ytdl_is_updateable():
3331 """ Returns if yt-dlp can be updated with -U """
3332
3333 from .update import is_non_updateable
3334
3335 return not is_non_updateable()
3336
3337
3338 def args_to_str(args):
3339 # Get a short string representation for a subprocess command
3340 return ' '.join(compat_shlex_quote(a) for a in args)
3341
3342
3343 def error_to_compat_str(err):
3344 return str(err)
3345
3346
3347 def error_to_str(err):
3348 return f'{type(err).__name__}: {err}'
3349
3350
3351 def mimetype2ext(mt):
3352 if mt is None:
3353 return None
3354
3355 mt, _, params = mt.partition(';')
3356 mt = mt.strip()
3357
3358 FULL_MAP = {
3359 'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3.
        # Use .mp3 here, as it is the most popular one
3362 'audio/mpeg': 'mp3',
3363 'audio/x-wav': 'wav',
3364 'audio/wav': 'wav',
3365 'audio/wave': 'wav',
3366 }
3367
3368 ext = FULL_MAP.get(mt)
3369 if ext is not None:
3370 return ext
3371
3372 SUBTYPE_MAP = {
3373 '3gpp': '3gp',
3374 'smptett+xml': 'tt',
3375 'ttaf+xml': 'dfxp',
3376 'ttml+xml': 'ttml',
3377 'x-flv': 'flv',
3378 'x-mp4-fragmented': 'mp4',
3379 'x-ms-sami': 'sami',
3380 'x-ms-wmv': 'wmv',
3381 'mpegurl': 'm3u8',
3382 'x-mpegurl': 'm3u8',
3383 'vnd.apple.mpegurl': 'm3u8',
3384 'dash+xml': 'mpd',
3385 'f4m+xml': 'f4m',
3386 'hds+xml': 'f4m',
3387 'vnd.ms-sstr+xml': 'ism',
3388 'quicktime': 'mov',
3389 'mp2t': 'ts',
3390 'x-wav': 'wav',
3391 'filmstrip+json': 'fs',
3392 'svg+xml': 'svg',
3393 }
3394
3395 _, _, subtype = mt.rpartition('/')
3396 ext = SUBTYPE_MAP.get(subtype.lower())
3397 if ext is not None:
3398 return ext
3399
3400 SUFFIX_MAP = {
3401 'json': 'json',
3402 'xml': 'xml',
3403 'zip': 'zip',
3404 'gzip': 'gz',
3405 }
3406
3407 _, _, suffix = subtype.partition('+')
3408 ext = SUFFIX_MAP.get(suffix)
3409 if ext is not None:
3410 return ext
3411
3412 return subtype.replace('+', '.')
3413
3414
3415 def ext2mimetype(ext_or_url):
3416 if not ext_or_url:
3417 return None
3418 if '.' not in ext_or_url:
3419 ext_or_url = f'file.{ext_or_url}'
3420 return mimetypes.guess_type(ext_or_url)[0]
3421
3422
3423 def parse_codecs(codecs_str):
3424 # http://tools.ietf.org/html/rfc6381
3425 if not codecs_str:
3426 return {}
3427 split_codecs = list(filter(None, map(
3428 str.strip, codecs_str.strip().strip(',').split(','))))
3429 vcodec, acodec, scodec, hdr = None, None, None, None
3430 for full_codec in split_codecs:
3431 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3432 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3433 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3434 if vcodec:
3435 continue
3436 vcodec = full_codec
3437 if parts[0] in ('dvh1', 'dvhe'):
3438 hdr = 'DV'
3439 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3440 hdr = 'HDR10'
3441 elif parts[:2] == ['vp9', '2']:
3442 hdr = 'HDR10'
3443 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3444 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3445 acodec = acodec or full_codec
3446 elif parts[0] in ('stpp', 'wvtt'):
3447 scodec = scodec or full_codec
3448 else:
3449 write_string(f'WARNING: Unknown codec {full_codec}\n')
3450 if vcodec or acodec or scodec:
3451 return {
3452 'vcodec': vcodec or 'none',
3453 'acodec': acodec or 'none',
3454 'dynamic_range': hdr,
3455 **({'scodec': scodec} if scodec is not None else {}),
3456 }
3457 elif len(split_codecs) == 2:
3458 return {
3459 'vcodec': split_codecs[0],
3460 'acodec': split_codecs[1],
3461 }
3462 return {}
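# Example (matches the project's test suite):
# >>> parse_codecs('avc1.77.30, mp4a.40.2')
# {'vcodec': 'avc1.77.30', 'acodec': 'mp4a.40.2', 'dynamic_range': None}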
3463
3464
3465 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3466 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3467
3468 allow_mkv = not preferences or 'mkv' in preferences
3469
3470 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: does any other format allow this?
3472
    # TODO: Not all codecs supported by parse_codecs are handled here
3474 COMPATIBLE_CODECS = {
3475 'mp4': {
3476 'av1', 'hevc', 'avc1', 'mp4a', # fourcc (m3u8, mpd)
3477 'h264', 'aacl', # Set in ISM
3478 },
3479 'webm': {
3480 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3481 'vp9x', 'vp8x', # in the webm spec
3482 },
3483 }
3484
3485 sanitize_codec = functools.partial(try_get, getter=lambda x: x.split('.')[0].replace('0', ''))
3486 vcodec, acodec = sanitize_codec(vcodecs[0]), sanitize_codec(acodecs[0])
3487
3488 for ext in preferences or COMPATIBLE_CODECS.keys():
3489 codec_set = COMPATIBLE_CODECS.get(ext, set())
3490 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3491 return ext
3492
3493 COMPATIBLE_EXTS = (
3494 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3495 {'webm'},
3496 )
3497 for ext in preferences or vexts:
3498 current_exts = {ext, *vexts, *aexts}
3499 if ext == 'mkv' or current_exts == {ext} or any(
3500 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3501 return ext
3502 return 'mkv' if allow_mkv else preferences[-1]
3503
3504
3505 def urlhandle_detect_ext(url_handle):
3506 getheader = url_handle.headers.get
3507
3508 cd = getheader('Content-Disposition')
3509 if cd:
3510 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3511 if m:
3512 e = determine_ext(m.group('filename'), default_ext=None)
3513 if e:
3514 return e
3515
3516 return mimetype2ext(getheader('Content-Type'))
3517
3518
3519 def encode_data_uri(data, mime_type):
3520 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3521
3522
3523 def age_restricted(content_limit, age_limit):
3524 """ Returns True iff the content should be blocked """
3525
3526 if age_limit is None: # No limit set
3527 return False
3528 if content_limit is None:
3529 return False # Content available for everyone
3530 return age_limit < content_limit
3531
3532
3533 # List of known byte-order-marks (BOM)
3534 BOMS = [
3535 (b'\xef\xbb\xbf', 'utf-8'),
3536 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3537 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3538 (b'\xff\xfe', 'utf-16-le'),
3539 (b'\xfe\xff', 'utf-16-be'),
3540 ]
3541
3542
3543 def is_html(first_bytes):
3544 """ Detect whether a file contains HTML by examining its first bytes. """
3545
3546 encoding = 'utf-8'
3547 for bom, enc in BOMS:
3548 while first_bytes.startswith(bom):
3549 encoding, first_bytes = enc, first_bytes[len(bom):]
3550
3551 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3552
3553
3554 def determine_protocol(info_dict):
3555 protocol = info_dict.get('protocol')
3556 if protocol is not None:
3557 return protocol
3558
3559 url = sanitize_url(info_dict['url'])
3560 if url.startswith('rtmp'):
3561 return 'rtmp'
3562 elif url.startswith('mms'):
3563 return 'mms'
3564 elif url.startswith('rtsp'):
3565 return 'rtsp'
3566
3567 ext = determine_ext(url)
3568 if ext == 'm3u8':
3569 return 'm3u8'
3570 elif ext == 'f4m':
3571 return 'f4m'
3572
3573 return urllib.parse.urlparse(url).scheme
3574
3575
3576 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3577 """ Render a list of rows, each as a list of values.
    Text after a \\t will be right aligned """
3579 def width(string):
3580 return len(remove_terminal_sequences(string).replace('\t', ''))
3581
3582 def get_max_lens(table):
3583 return [max(width(str(v)) for v in col) for col in zip(*table)]
3584
3585 def filter_using_list(row, filterArray):
3586 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3587
3588 max_lens = get_max_lens(data) if hide_empty else []
3589 header_row = filter_using_list(header_row, max_lens)
3590 data = [filter_using_list(row, max_lens) for row in data]
3591
3592 table = [header_row] + data
3593 max_lens = get_max_lens(table)
3594 extra_gap += 1
3595 if delim:
3596 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3597 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3598 for row in table:
3599 for pos, text in enumerate(map(str, row)):
3600 if '\t' in text:
3601 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3602 else:
3603 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3604 ret = '\n'.join(''.join(row).rstrip() for row in table)
3605 return ret
3606
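# Illustrative example of the expected output:
#   >>> print(render_table(['ID', 'NAME'], [['1', 'foo'], ['2', 'bar']]))
#   ID NAME
#   1  foo
#   2  bar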
3607
3608 def _match_one(filter_part, dct, incomplete):
3609 # TODO: Generalize code with YoutubeDL._build_format_filter
3610 STRING_OPERATORS = {
3611 '*=': operator.contains,
3612 '^=': lambda attr, value: attr.startswith(value),
3613 '$=': lambda attr, value: attr.endswith(value),
3614 '~=': lambda attr, value: re.search(value, attr),
3615 }
3616 COMPARISON_OPERATORS = {
3617 **STRING_OPERATORS,
3618 '<=': operator.le, # "<=" must be defined above "<"
3619 '<': operator.lt,
3620 '>=': operator.ge,
3621 '>': operator.gt,
3622 '=': operator.eq,
3623 }
3624
3625 if isinstance(incomplete, bool):
3626 is_incomplete = lambda _: incomplete
3627 else:
3628 is_incomplete = lambda k: k in incomplete
3629
3630 operator_rex = re.compile(r'''(?x)
3631 (?P<key>[a-z_]+)
3632 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3633 (?:
3634 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3635 (?P<strval>.+?)
3636 )
3637 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3638 m = operator_rex.fullmatch(filter_part.strip())
3639 if m:
3640 m = m.groupdict()
3641 unnegated_op = COMPARISON_OPERATORS[m['op']]
3642 if m['negation']:
3643 op = lambda attr, value: not unnegated_op(attr, value)
3644 else:
3645 op = unnegated_op
3646 comparison_value = m['quotedstrval'] or m['strval']
3647 if m['quote']:
3648 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3649 actual_value = dct.get(m['key'])
3650 numeric_comparison = None
3651 if isinstance(actual_value, (int, float)):
3652 # If the original field is a string and the matching comparison value is
3653 # a number, we should respect the origin of the original field
3654 # and process the comparison value as a string (see
3655 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3656 try:
3657 numeric_comparison = int(comparison_value)
3658 except ValueError:
3659 numeric_comparison = parse_filesize(comparison_value)
3660 if numeric_comparison is None:
3661 numeric_comparison = parse_filesize(f'{comparison_value}B')
3662 if numeric_comparison is None:
3663 numeric_comparison = parse_duration(comparison_value)
3664 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3665 raise ValueError('Operator %s only supports string values!' % m['op'])
3666 if actual_value is None:
3667 return is_incomplete(m['key']) or m['none_inclusive']
3668 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3669
3670 UNARY_OPERATORS = {
3671 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3672 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3673 }
3674 operator_rex = re.compile(r'''(?x)
3675 (?P<op>%s)\s*(?P<key>[a-z_]+)
3676 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3677 m = operator_rex.fullmatch(filter_part.strip())
3678 if m:
3679 op = UNARY_OPERATORS[m.group('op')]
3680 actual_value = dct.get(m.group('key'))
3681 if is_incomplete(m.group('key')) and actual_value is None:
3682 return True
3683 return op(actual_value)
3684
3685 raise ValueError('Invalid filter part %r' % filter_part)
3686
3687
3688 def match_str(filter_str, dct, incomplete=False):
3689 """ Filter a dictionary with a simple string syntax.
3690 @returns Whether the filter passes
3691 @param incomplete Set of keys that are expected to be missing from dct.
3692 Can be True/False to indicate that all/none of the keys may be missing.
3693 All conditions on incomplete keys pass if the key is missing.
3694 """
3695 return all(
3696 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3697 for filter_part in re.split(r'(?<!\\)&', filter_str))
3698
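# Illustrative example: numeric fields compare numerically, and the '?'
# suffix lets a condition pass when the key is missing from dct:
#   >>> match_str('like_count >? 100 & duration < 600', {'duration': 300})
#   True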
3699
3700 def match_filter_func(filters):
3701 if not filters:
3702 return None
3703 filters = set(variadic(filters))
3704
3705 interactive = '-' in filters
3706 if interactive:
3707 filters.remove('-')
3708
3709 def _match_func(info_dict, incomplete=False):
3710 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3711 return NO_DEFAULT if interactive and not incomplete else None
3712 else:
3713 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3714 filter_str = ') | ('.join(map(str.strip, filters))
3715 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3716 return _match_func
3717
3718
3719 class download_range_func:
3720 def __init__(self, chapters, ranges):
3721 self.chapters, self.ranges = chapters, ranges
3722
3723 def __call__(self, info_dict, ydl):
3724 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3725 else 'Cannot match chapters since chapter information is unavailable')
3726 for regex in self.chapters or []:
3727 for i, chapter in enumerate(info_dict.get('chapters') or []):
3728 if re.search(regex, chapter['title']):
3729 warning = None
3730 yield {**chapter, 'index': i}
3731 if self.chapters and warning:
3732 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3733
3734 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3735
3736 def __eq__(self, other):
3737 return (isinstance(other, download_range_func)
3738 and self.chapters == other.chapters and self.ranges == other.ranges)
3739
3740
3741 def parse_dfxp_time_expr(time_expr):
3742 if not time_expr:
3743 return
3744
3745 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3746 if mobj:
3747 return float(mobj.group('time_offset'))
3748
3749 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3750 if mobj:
3751 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3752
3753
3754 def srt_subtitles_timecode(seconds):
3755 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3756
3757
3758 def ass_subtitles_timecode(seconds):
3759 time = timetuple_from_msec(seconds * 1000)
3760 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3761
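# Illustrative examples of the subtitle time helpers:
#   >>> parse_dfxp_time_expr('00:01:02.5')
#   62.5
#   >>> srt_subtitles_timecode(61.5)
#   '00:01:01,500'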
3762
3763 def dfxp2srt(dfxp_data):
3764 '''
3765 @param dfxp_data A bytes-like object containing DFXP data
3766 @returns A unicode object containing converted SRT data
3767 '''
3768 LEGACY_NAMESPACES = (
3769 (b'http://www.w3.org/ns/ttml', [
3770 b'http://www.w3.org/2004/11/ttaf1',
3771 b'http://www.w3.org/2006/04/ttaf1',
3772 b'http://www.w3.org/2006/10/ttaf1',
3773 ]),
3774 (b'http://www.w3.org/ns/ttml#styling', [
3775 b'http://www.w3.org/ns/ttml#style',
3776 ]),
3777 )
3778
3779 SUPPORTED_STYLING = [
3780 'color',
3781 'fontFamily',
3782 'fontSize',
3783 'fontStyle',
3784 'fontWeight',
3785 'textDecoration'
3786 ]
3787
3788 _x = functools.partial(xpath_with_ns, ns_map={
3789 'xml': 'http://www.w3.org/XML/1998/namespace',
3790 'ttml': 'http://www.w3.org/ns/ttml',
3791 'tts': 'http://www.w3.org/ns/ttml#styling',
3792 })
3793
3794 styles = {}
3795 default_style = {}
3796
3797 class TTMLPElementParser:
3798 _out = ''
3799 _unclosed_elements = []
3800 _applied_styles = []
3801
3802 def start(self, tag, attrib):
3803 if tag in (_x('ttml:br'), 'br'):
3804 self._out += '\n'
3805 else:
3806 unclosed_elements = []
3807 style = {}
3808 element_style_id = attrib.get('style')
3809 if default_style:
3810 style.update(default_style)
3811 if element_style_id:
3812 style.update(styles.get(element_style_id, {}))
3813 for prop in SUPPORTED_STYLING:
3814 prop_val = attrib.get(_x('tts:' + prop))
3815 if prop_val:
3816 style[prop] = prop_val
3817 if style:
3818 font = ''
3819 for k, v in sorted(style.items()):
3820 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3821 continue
3822 if k == 'color':
3823 font += ' color="%s"' % v
3824 elif k == 'fontSize':
3825 font += ' size="%s"' % v
3826 elif k == 'fontFamily':
3827 font += ' face="%s"' % v
3828 elif k == 'fontWeight' and v == 'bold':
3829 self._out += '<b>'
3830 unclosed_elements.append('b')
3831 elif k == 'fontStyle' and v == 'italic':
3832 self._out += '<i>'
3833 unclosed_elements.append('i')
3834 elif k == 'textDecoration' and v == 'underline':
3835 self._out += '<u>'
3836 unclosed_elements.append('u')
3837 if font:
3838 self._out += '<font' + font + '>'
3839 unclosed_elements.append('font')
3840 applied_style = {}
3841 if self._applied_styles:
3842 applied_style.update(self._applied_styles[-1])
3843 applied_style.update(style)
3844 self._applied_styles.append(applied_style)
3845 self._unclosed_elements.append(unclosed_elements)
3846
3847 def end(self, tag):
3848 if tag not in (_x('ttml:br'), 'br'):
3849 unclosed_elements = self._unclosed_elements.pop()
3850 for element in reversed(unclosed_elements):
3851 self._out += '</%s>' % element
3852 if unclosed_elements and self._applied_styles:
3853 self._applied_styles.pop()
3854
3855 def data(self, data):
3856 self._out += data
3857
3858 def close(self):
3859 return self._out.strip()
3860
3861 def parse_node(node):
3862 target = TTMLPElementParser()
3863 parser = xml.etree.ElementTree.XMLParser(target=target)
3864 parser.feed(xml.etree.ElementTree.tostring(node))
3865 return parser.close()
3866
3867 for k, v in LEGACY_NAMESPACES:
3868 for ns in v:
3869 dfxp_data = dfxp_data.replace(ns, k)
3870
3871 dfxp = compat_etree_fromstring(dfxp_data)
3872 out = []
3873 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3874
3875 if not paras:
3876 raise ValueError('Invalid dfxp/TTML subtitle')
3877
3878 repeat = False
3879 while True:
3880 for style in dfxp.findall(_x('.//ttml:style')):
3881 style_id = style.get('id') or style.get(_x('xml:id'))
3882 if not style_id:
3883 continue
3884 parent_style_id = style.get('style')
3885 if parent_style_id:
3886 if parent_style_id not in styles:
3887 repeat = True
3888 continue
3889 styles[style_id] = styles[parent_style_id].copy()
3890 for prop in SUPPORTED_STYLING:
3891 prop_val = style.get(_x('tts:' + prop))
3892 if prop_val:
3893 styles.setdefault(style_id, {})[prop] = prop_val
3894 if repeat:
3895 repeat = False
3896 else:
3897 break
3898
3899 for p in ('body', 'div'):
3900 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3901 if ele is None:
3902 continue
3903 style = styles.get(ele.get('style'))
3904 if not style:
3905 continue
3906 default_style.update(style)
3907
3908 for para, index in zip(paras, itertools.count(1)):
3909 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3910 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3911 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3912 if begin_time is None:
3913 continue
3914 if not end_time:
3915 if not dur:
3916 continue
3917 end_time = begin_time + dur
3918 out.append('%d\n%s --> %s\n%s\n\n' % (
3919 index,
3920 srt_subtitles_timecode(begin_time),
3921 srt_subtitles_timecode(end_time),
3922 parse_node(para)))
3923
3924 return ''.join(out)
3925
3926
3927 def cli_option(params, command_option, param, separator=None):
3928 param = params.get(param)
3929 return ([] if param is None
3930 else [command_option, str(param)] if separator is None
3931 else [f'{command_option}{separator}{param}'])
3932
3933
3934 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3935 param = params.get(param)
3936 assert param in (True, False, None)
3937 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3938
3939
3940 def cli_valueless_option(params, command_option, param, expected_value=True):
3941 return [command_option] if params.get(param) == expected_value else []
3942
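# Illustrative examples of the cli_* helpers (parameter names are placeholders):
#   >>> cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', 'http://127.0.0.1:3128']
#   >>> cli_bool_option({'check': True}, '--check-certificate', 'check')
#   ['--check-certificate', 'true']
#   >>> cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#   ['--quiet']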
3943
3944 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3945 if isinstance(argdict, (list, tuple)): # for backward compatibility
3946 if use_compat:
3947 return argdict
3948 else:
3949 argdict = None
3950 if argdict is None:
3951 return default
3952 assert isinstance(argdict, dict)
3953
3954 assert isinstance(keys, (list, tuple))
3955 for key_list in keys:
3956 arg_list = list(filter(
3957 lambda x: x is not None,
3958 [argdict.get(key.lower()) for key in variadic(key_list)]))
3959 if arg_list:
3960 return [arg for args in arg_list for arg in args]
3961 return default
3962
3963
3964 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3965 main_key, exe = main_key.lower(), exe.lower()
3966 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3967 keys = [f'{root_key}{k}' for k in (keys or [''])]
3968 if root_key in keys:
3969 if main_key != exe:
3970 keys.append((main_key, exe))
3971 keys.append('default')
3972 else:
3973 use_compat = False
3974 return cli_configuration_args(argdict, keys, default, use_compat)
3975
3976
3977 class ISO639Utils:
3978 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3979 _lang_map = {
3980 'aa': 'aar',
3981 'ab': 'abk',
3982 'ae': 'ave',
3983 'af': 'afr',
3984 'ak': 'aka',
3985 'am': 'amh',
3986 'an': 'arg',
3987 'ar': 'ara',
3988 'as': 'asm',
3989 'av': 'ava',
3990 'ay': 'aym',
3991 'az': 'aze',
3992 'ba': 'bak',
3993 'be': 'bel',
3994 'bg': 'bul',
3995 'bh': 'bih',
3996 'bi': 'bis',
3997 'bm': 'bam',
3998 'bn': 'ben',
3999 'bo': 'bod',
4000 'br': 'bre',
4001 'bs': 'bos',
4002 'ca': 'cat',
4003 'ce': 'che',
4004 'ch': 'cha',
4005 'co': 'cos',
4006 'cr': 'cre',
4007 'cs': 'ces',
4008 'cu': 'chu',
4009 'cv': 'chv',
4010 'cy': 'cym',
4011 'da': 'dan',
4012 'de': 'deu',
4013 'dv': 'div',
4014 'dz': 'dzo',
4015 'ee': 'ewe',
4016 'el': 'ell',
4017 'en': 'eng',
4018 'eo': 'epo',
4019 'es': 'spa',
4020 'et': 'est',
4021 'eu': 'eus',
4022 'fa': 'fas',
4023 'ff': 'ful',
4024 'fi': 'fin',
4025 'fj': 'fij',
4026 'fo': 'fao',
4027 'fr': 'fra',
4028 'fy': 'fry',
4029 'ga': 'gle',
4030 'gd': 'gla',
4031 'gl': 'glg',
4032 'gn': 'grn',
4033 'gu': 'guj',
4034 'gv': 'glv',
4035 'ha': 'hau',
4036 'he': 'heb',
4037 'iw': 'heb', # Replaced by he in 1989 revision
4038 'hi': 'hin',
4039 'ho': 'hmo',
4040 'hr': 'hrv',
4041 'ht': 'hat',
4042 'hu': 'hun',
4043 'hy': 'hye',
4044 'hz': 'her',
4045 'ia': 'ina',
4046 'id': 'ind',
4047 'in': 'ind', # Replaced by id in 1989 revision
4048 'ie': 'ile',
4049 'ig': 'ibo',
4050 'ii': 'iii',
4051 'ik': 'ipk',
4052 'io': 'ido',
4053 'is': 'isl',
4054 'it': 'ita',
4055 'iu': 'iku',
4056 'ja': 'jpn',
4057 'jv': 'jav',
4058 'ka': 'kat',
4059 'kg': 'kon',
4060 'ki': 'kik',
4061 'kj': 'kua',
4062 'kk': 'kaz',
4063 'kl': 'kal',
4064 'km': 'khm',
4065 'kn': 'kan',
4066 'ko': 'kor',
4067 'kr': 'kau',
4068 'ks': 'kas',
4069 'ku': 'kur',
4070 'kv': 'kom',
4071 'kw': 'cor',
4072 'ky': 'kir',
4073 'la': 'lat',
4074 'lb': 'ltz',
4075 'lg': 'lug',
4076 'li': 'lim',
4077 'ln': 'lin',
4078 'lo': 'lao',
4079 'lt': 'lit',
4080 'lu': 'lub',
4081 'lv': 'lav',
4082 'mg': 'mlg',
4083 'mh': 'mah',
4084 'mi': 'mri',
4085 'mk': 'mkd',
4086 'ml': 'mal',
4087 'mn': 'mon',
4088 'mr': 'mar',
4089 'ms': 'msa',
4090 'mt': 'mlt',
4091 'my': 'mya',
4092 'na': 'nau',
4093 'nb': 'nob',
4094 'nd': 'nde',
4095 'ne': 'nep',
4096 'ng': 'ndo',
4097 'nl': 'nld',
4098 'nn': 'nno',
4099 'no': 'nor',
4100 'nr': 'nbl',
4101 'nv': 'nav',
4102 'ny': 'nya',
4103 'oc': 'oci',
4104 'oj': 'oji',
4105 'om': 'orm',
4106 'or': 'ori',
4107 'os': 'oss',
4108 'pa': 'pan',
4109 'pi': 'pli',
4110 'pl': 'pol',
4111 'ps': 'pus',
4112 'pt': 'por',
4113 'qu': 'que',
4114 'rm': 'roh',
4115 'rn': 'run',
4116 'ro': 'ron',
4117 'ru': 'rus',
4118 'rw': 'kin',
4119 'sa': 'san',
4120 'sc': 'srd',
4121 'sd': 'snd',
4122 'se': 'sme',
4123 'sg': 'sag',
4124 'si': 'sin',
4125 'sk': 'slk',
4126 'sl': 'slv',
4127 'sm': 'smo',
4128 'sn': 'sna',
4129 'so': 'som',
4130 'sq': 'sqi',
4131 'sr': 'srp',
4132 'ss': 'ssw',
4133 'st': 'sot',
4134 'su': 'sun',
4135 'sv': 'swe',
4136 'sw': 'swa',
4137 'ta': 'tam',
4138 'te': 'tel',
4139 'tg': 'tgk',
4140 'th': 'tha',
4141 'ti': 'tir',
4142 'tk': 'tuk',
4143 'tl': 'tgl',
4144 'tn': 'tsn',
4145 'to': 'ton',
4146 'tr': 'tur',
4147 'ts': 'tso',
4148 'tt': 'tat',
4149 'tw': 'twi',
4150 'ty': 'tah',
4151 'ug': 'uig',
4152 'uk': 'ukr',
4153 'ur': 'urd',
4154 'uz': 'uzb',
4155 've': 'ven',
4156 'vi': 'vie',
4157 'vo': 'vol',
4158 'wa': 'wln',
4159 'wo': 'wol',
4160 'xh': 'xho',
4161 'yi': 'yid',
4162 'ji': 'yid', # Replaced by yi in 1989 revision
4163 'yo': 'yor',
4164 'za': 'zha',
4165 'zh': 'zho',
4166 'zu': 'zul',
4167 }
4168
4169 @classmethod
4170 def short2long(cls, code):
4171 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4172 return cls._lang_map.get(code[:2])
4173
4174 @classmethod
4175 def long2short(cls, code):
4176 """Convert language code from ISO 639-2/T to ISO 639-1"""
4177 for short_name, long_name in cls._lang_map.items():
4178 if long_name == code:
4179 return short_name
4180
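# Illustrative examples:
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'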
4181
4182 class ISO3166Utils:
4183 # From http://data.okfn.org/data/core/country-list
4184 _country_map = {
4185 'AF': 'Afghanistan',
4186 'AX': 'Åland Islands',
4187 'AL': 'Albania',
4188 'DZ': 'Algeria',
4189 'AS': 'American Samoa',
4190 'AD': 'Andorra',
4191 'AO': 'Angola',
4192 'AI': 'Anguilla',
4193 'AQ': 'Antarctica',
4194 'AG': 'Antigua and Barbuda',
4195 'AR': 'Argentina',
4196 'AM': 'Armenia',
4197 'AW': 'Aruba',
4198 'AU': 'Australia',
4199 'AT': 'Austria',
4200 'AZ': 'Azerbaijan',
4201 'BS': 'Bahamas',
4202 'BH': 'Bahrain',
4203 'BD': 'Bangladesh',
4204 'BB': 'Barbados',
4205 'BY': 'Belarus',
4206 'BE': 'Belgium',
4207 'BZ': 'Belize',
4208 'BJ': 'Benin',
4209 'BM': 'Bermuda',
4210 'BT': 'Bhutan',
4211 'BO': 'Bolivia, Plurinational State of',
4212 'BQ': 'Bonaire, Sint Eustatius and Saba',
4213 'BA': 'Bosnia and Herzegovina',
4214 'BW': 'Botswana',
4215 'BV': 'Bouvet Island',
4216 'BR': 'Brazil',
4217 'IO': 'British Indian Ocean Territory',
4218 'BN': 'Brunei Darussalam',
4219 'BG': 'Bulgaria',
4220 'BF': 'Burkina Faso',
4221 'BI': 'Burundi',
4222 'KH': 'Cambodia',
4223 'CM': 'Cameroon',
4224 'CA': 'Canada',
4225 'CV': 'Cape Verde',
4226 'KY': 'Cayman Islands',
4227 'CF': 'Central African Republic',
4228 'TD': 'Chad',
4229 'CL': 'Chile',
4230 'CN': 'China',
4231 'CX': 'Christmas Island',
4232 'CC': 'Cocos (Keeling) Islands',
4233 'CO': 'Colombia',
4234 'KM': 'Comoros',
4235 'CG': 'Congo',
4236 'CD': 'Congo, the Democratic Republic of the',
4237 'CK': 'Cook Islands',
4238 'CR': 'Costa Rica',
4239 'CI': 'Côte d\'Ivoire',
4240 'HR': 'Croatia',
4241 'CU': 'Cuba',
4242 'CW': 'Curaçao',
4243 'CY': 'Cyprus',
4244 'CZ': 'Czech Republic',
4245 'DK': 'Denmark',
4246 'DJ': 'Djibouti',
4247 'DM': 'Dominica',
4248 'DO': 'Dominican Republic',
4249 'EC': 'Ecuador',
4250 'EG': 'Egypt',
4251 'SV': 'El Salvador',
4252 'GQ': 'Equatorial Guinea',
4253 'ER': 'Eritrea',
4254 'EE': 'Estonia',
4255 'ET': 'Ethiopia',
4256 'FK': 'Falkland Islands (Malvinas)',
4257 'FO': 'Faroe Islands',
4258 'FJ': 'Fiji',
4259 'FI': 'Finland',
4260 'FR': 'France',
4261 'GF': 'French Guiana',
4262 'PF': 'French Polynesia',
4263 'TF': 'French Southern Territories',
4264 'GA': 'Gabon',
4265 'GM': 'Gambia',
4266 'GE': 'Georgia',
4267 'DE': 'Germany',
4268 'GH': 'Ghana',
4269 'GI': 'Gibraltar',
4270 'GR': 'Greece',
4271 'GL': 'Greenland',
4272 'GD': 'Grenada',
4273 'GP': 'Guadeloupe',
4274 'GU': 'Guam',
4275 'GT': 'Guatemala',
4276 'GG': 'Guernsey',
4277 'GN': 'Guinea',
4278 'GW': 'Guinea-Bissau',
4279 'GY': 'Guyana',
4280 'HT': 'Haiti',
4281 'HM': 'Heard Island and McDonald Islands',
4282 'VA': 'Holy See (Vatican City State)',
4283 'HN': 'Honduras',
4284 'HK': 'Hong Kong',
4285 'HU': 'Hungary',
4286 'IS': 'Iceland',
4287 'IN': 'India',
4288 'ID': 'Indonesia',
4289 'IR': 'Iran, Islamic Republic of',
4290 'IQ': 'Iraq',
4291 'IE': 'Ireland',
4292 'IM': 'Isle of Man',
4293 'IL': 'Israel',
4294 'IT': 'Italy',
4295 'JM': 'Jamaica',
4296 'JP': 'Japan',
4297 'JE': 'Jersey',
4298 'JO': 'Jordan',
4299 'KZ': 'Kazakhstan',
4300 'KE': 'Kenya',
4301 'KI': 'Kiribati',
4302 'KP': 'Korea, Democratic People\'s Republic of',
4303 'KR': 'Korea, Republic of',
4304 'KW': 'Kuwait',
4305 'KG': 'Kyrgyzstan',
4306 'LA': 'Lao People\'s Democratic Republic',
4307 'LV': 'Latvia',
4308 'LB': 'Lebanon',
4309 'LS': 'Lesotho',
4310 'LR': 'Liberia',
4311 'LY': 'Libya',
4312 'LI': 'Liechtenstein',
4313 'LT': 'Lithuania',
4314 'LU': 'Luxembourg',
4315 'MO': 'Macao',
4316 'MK': 'Macedonia, the Former Yugoslav Republic of',
4317 'MG': 'Madagascar',
4318 'MW': 'Malawi',
4319 'MY': 'Malaysia',
4320 'MV': 'Maldives',
4321 'ML': 'Mali',
4322 'MT': 'Malta',
4323 'MH': 'Marshall Islands',
4324 'MQ': 'Martinique',
4325 'MR': 'Mauritania',
4326 'MU': 'Mauritius',
4327 'YT': 'Mayotte',
4328 'MX': 'Mexico',
4329 'FM': 'Micronesia, Federated States of',
4330 'MD': 'Moldova, Republic of',
4331 'MC': 'Monaco',
4332 'MN': 'Mongolia',
4333 'ME': 'Montenegro',
4334 'MS': 'Montserrat',
4335 'MA': 'Morocco',
4336 'MZ': 'Mozambique',
4337 'MM': 'Myanmar',
4338 'NA': 'Namibia',
4339 'NR': 'Nauru',
4340 'NP': 'Nepal',
4341 'NL': 'Netherlands',
4342 'NC': 'New Caledonia',
4343 'NZ': 'New Zealand',
4344 'NI': 'Nicaragua',
4345 'NE': 'Niger',
4346 'NG': 'Nigeria',
4347 'NU': 'Niue',
4348 'NF': 'Norfolk Island',
4349 'MP': 'Northern Mariana Islands',
4350 'NO': 'Norway',
4351 'OM': 'Oman',
4352 'PK': 'Pakistan',
4353 'PW': 'Palau',
4354 'PS': 'Palestine, State of',
4355 'PA': 'Panama',
4356 'PG': 'Papua New Guinea',
4357 'PY': 'Paraguay',
4358 'PE': 'Peru',
4359 'PH': 'Philippines',
4360 'PN': 'Pitcairn',
4361 'PL': 'Poland',
4362 'PT': 'Portugal',
4363 'PR': 'Puerto Rico',
4364 'QA': 'Qatar',
4365 'RE': 'Réunion',
4366 'RO': 'Romania',
4367 'RU': 'Russian Federation',
4368 'RW': 'Rwanda',
4369 'BL': 'Saint Barthélemy',
4370 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4371 'KN': 'Saint Kitts and Nevis',
4372 'LC': 'Saint Lucia',
4373 'MF': 'Saint Martin (French part)',
4374 'PM': 'Saint Pierre and Miquelon',
4375 'VC': 'Saint Vincent and the Grenadines',
4376 'WS': 'Samoa',
4377 'SM': 'San Marino',
4378 'ST': 'Sao Tome and Principe',
4379 'SA': 'Saudi Arabia',
4380 'SN': 'Senegal',
4381 'RS': 'Serbia',
4382 'SC': 'Seychelles',
4383 'SL': 'Sierra Leone',
4384 'SG': 'Singapore',
4385 'SX': 'Sint Maarten (Dutch part)',
4386 'SK': 'Slovakia',
4387 'SI': 'Slovenia',
4388 'SB': 'Solomon Islands',
4389 'SO': 'Somalia',
4390 'ZA': 'South Africa',
4391 'GS': 'South Georgia and the South Sandwich Islands',
4392 'SS': 'South Sudan',
4393 'ES': 'Spain',
4394 'LK': 'Sri Lanka',
4395 'SD': 'Sudan',
4396 'SR': 'Suriname',
4397 'SJ': 'Svalbard and Jan Mayen',
4398 'SZ': 'Swaziland',
4399 'SE': 'Sweden',
4400 'CH': 'Switzerland',
4401 'SY': 'Syrian Arab Republic',
4402 'TW': 'Taiwan, Province of China',
4403 'TJ': 'Tajikistan',
4404 'TZ': 'Tanzania, United Republic of',
4405 'TH': 'Thailand',
4406 'TL': 'Timor-Leste',
4407 'TG': 'Togo',
4408 'TK': 'Tokelau',
4409 'TO': 'Tonga',
4410 'TT': 'Trinidad and Tobago',
4411 'TN': 'Tunisia',
4412 'TR': 'Turkey',
4413 'TM': 'Turkmenistan',
4414 'TC': 'Turks and Caicos Islands',
4415 'TV': 'Tuvalu',
4416 'UG': 'Uganda',
4417 'UA': 'Ukraine',
4418 'AE': 'United Arab Emirates',
4419 'GB': 'United Kingdom',
4420 'US': 'United States',
4421 'UM': 'United States Minor Outlying Islands',
4422 'UY': 'Uruguay',
4423 'UZ': 'Uzbekistan',
4424 'VU': 'Vanuatu',
4425 'VE': 'Venezuela, Bolivarian Republic of',
4426 'VN': 'Viet Nam',
4427 'VG': 'Virgin Islands, British',
4428 'VI': 'Virgin Islands, U.S.',
4429 'WF': 'Wallis and Futuna',
4430 'EH': 'Western Sahara',
4431 'YE': 'Yemen',
4432 'ZM': 'Zambia',
4433 'ZW': 'Zimbabwe',
4434 # Not ISO 3166 codes, but used for IP blocks
4435 'AP': 'Asia/Pacific Region',
4436 'EU': 'Europe',
4437 }
4438
4439 @classmethod
4440 def short2full(cls, code):
4441 """Convert an ISO 3166-2 country code to the corresponding full name"""
4442 return cls._country_map.get(code.upper())
4443
4444
4445 class GeoUtils:
4446 # Major IPv4 address blocks per country
4447 _country_ip_map = {
4448 'AD': '46.172.224.0/19',
4449 'AE': '94.200.0.0/13',
4450 'AF': '149.54.0.0/17',
4451 'AG': '209.59.64.0/18',
4452 'AI': '204.14.248.0/21',
4453 'AL': '46.99.0.0/16',
4454 'AM': '46.70.0.0/15',
4455 'AO': '105.168.0.0/13',
4456 'AP': '182.50.184.0/21',
4457 'AQ': '23.154.160.0/24',
4458 'AR': '181.0.0.0/12',
4459 'AS': '202.70.112.0/20',
4460 'AT': '77.116.0.0/14',
4461 'AU': '1.128.0.0/11',
4462 'AW': '181.41.0.0/18',
4463 'AX': '185.217.4.0/22',
4464 'AZ': '5.197.0.0/16',
4465 'BA': '31.176.128.0/17',
4466 'BB': '65.48.128.0/17',
4467 'BD': '114.130.0.0/16',
4468 'BE': '57.0.0.0/8',
4469 'BF': '102.178.0.0/15',
4470 'BG': '95.42.0.0/15',
4471 'BH': '37.131.0.0/17',
4472 'BI': '154.117.192.0/18',
4473 'BJ': '137.255.0.0/16',
4474 'BL': '185.212.72.0/23',
4475 'BM': '196.12.64.0/18',
4476 'BN': '156.31.0.0/16',
4477 'BO': '161.56.0.0/16',
4478 'BQ': '161.0.80.0/20',
4479 'BR': '191.128.0.0/12',
4480 'BS': '24.51.64.0/18',
4481 'BT': '119.2.96.0/19',
4482 'BW': '168.167.0.0/16',
4483 'BY': '178.120.0.0/13',
4484 'BZ': '179.42.192.0/18',
4485 'CA': '99.224.0.0/11',
4486 'CD': '41.243.0.0/16',
4487 'CF': '197.242.176.0/21',
4488 'CG': '160.113.0.0/16',
4489 'CH': '85.0.0.0/13',
4490 'CI': '102.136.0.0/14',
4491 'CK': '202.65.32.0/19',
4492 'CL': '152.172.0.0/14',
4493 'CM': '102.244.0.0/14',
4494 'CN': '36.128.0.0/10',
4495 'CO': '181.240.0.0/12',
4496 'CR': '201.192.0.0/12',
4497 'CU': '152.206.0.0/15',
4498 'CV': '165.90.96.0/19',
4499 'CW': '190.88.128.0/17',
4500 'CY': '31.153.0.0/16',
4501 'CZ': '88.100.0.0/14',
4502 'DE': '53.0.0.0/8',
4503 'DJ': '197.241.0.0/17',
4504 'DK': '87.48.0.0/12',
4505 'DM': '192.243.48.0/20',
4506 'DO': '152.166.0.0/15',
4507 'DZ': '41.96.0.0/12',
4508 'EC': '186.68.0.0/15',
4509 'EE': '90.190.0.0/15',
4510 'EG': '156.160.0.0/11',
4511 'ER': '196.200.96.0/20',
4512 'ES': '88.0.0.0/11',
4513 'ET': '196.188.0.0/14',
4514 'EU': '2.16.0.0/13',
4515 'FI': '91.152.0.0/13',
4516 'FJ': '144.120.0.0/16',
4517 'FK': '80.73.208.0/21',
4518 'FM': '119.252.112.0/20',
4519 'FO': '88.85.32.0/19',
4520 'FR': '90.0.0.0/9',
4521 'GA': '41.158.0.0/15',
4522 'GB': '25.0.0.0/8',
4523 'GD': '74.122.88.0/21',
4524 'GE': '31.146.0.0/16',
4525 'GF': '161.22.64.0/18',
4526 'GG': '62.68.160.0/19',
4527 'GH': '154.160.0.0/12',
4528 'GI': '95.164.0.0/16',
4529 'GL': '88.83.0.0/19',
4530 'GM': '160.182.0.0/15',
4531 'GN': '197.149.192.0/18',
4532 'GP': '104.250.0.0/19',
4533 'GQ': '105.235.224.0/20',
4534 'GR': '94.64.0.0/13',
4535 'GT': '168.234.0.0/16',
4536 'GU': '168.123.0.0/16',
4537 'GW': '197.214.80.0/20',
4538 'GY': '181.41.64.0/18',
4539 'HK': '113.252.0.0/14',
4540 'HN': '181.210.0.0/16',
4541 'HR': '93.136.0.0/13',
4542 'HT': '148.102.128.0/17',
4543 'HU': '84.0.0.0/14',
4544 'ID': '39.192.0.0/10',
4545 'IE': '87.32.0.0/12',
4546 'IL': '79.176.0.0/13',
4547 'IM': '5.62.80.0/20',
4548 'IN': '117.192.0.0/10',
4549 'IO': '203.83.48.0/21',
4550 'IQ': '37.236.0.0/14',
4551 'IR': '2.176.0.0/12',
4552 'IS': '82.221.0.0/16',
4553 'IT': '79.0.0.0/10',
4554 'JE': '87.244.64.0/18',
4555 'JM': '72.27.0.0/17',
4556 'JO': '176.29.0.0/16',
4557 'JP': '133.0.0.0/8',
4558 'KE': '105.48.0.0/12',
4559 'KG': '158.181.128.0/17',
4560 'KH': '36.37.128.0/17',
4561 'KI': '103.25.140.0/22',
4562 'KM': '197.255.224.0/20',
4563 'KN': '198.167.192.0/19',
4564 'KP': '175.45.176.0/22',
4565 'KR': '175.192.0.0/10',
4566 'KW': '37.36.0.0/14',
4567 'KY': '64.96.0.0/15',
4568 'KZ': '2.72.0.0/13',
4569 'LA': '115.84.64.0/18',
4570 'LB': '178.135.0.0/16',
4571 'LC': '24.92.144.0/20',
4572 'LI': '82.117.0.0/19',
4573 'LK': '112.134.0.0/15',
4574 'LR': '102.183.0.0/16',
4575 'LS': '129.232.0.0/17',
4576 'LT': '78.56.0.0/13',
4577 'LU': '188.42.0.0/16',
4578 'LV': '46.109.0.0/16',
4579 'LY': '41.252.0.0/14',
4580 'MA': '105.128.0.0/11',
4581 'MC': '88.209.64.0/18',
4582 'MD': '37.246.0.0/16',
4583 'ME': '178.175.0.0/17',
4584 'MF': '74.112.232.0/21',
4585 'MG': '154.126.0.0/17',
4586 'MH': '117.103.88.0/21',
4587 'MK': '77.28.0.0/15',
4588 'ML': '154.118.128.0/18',
4589 'MM': '37.111.0.0/17',
4590 'MN': '49.0.128.0/17',
4591 'MO': '60.246.0.0/16',
4592 'MP': '202.88.64.0/20',
4593 'MQ': '109.203.224.0/19',
4594 'MR': '41.188.64.0/18',
4595 'MS': '208.90.112.0/22',
4596 'MT': '46.11.0.0/16',
4597 'MU': '105.16.0.0/12',
4598 'MV': '27.114.128.0/18',
4599 'MW': '102.70.0.0/15',
4600 'MX': '187.192.0.0/11',
4601 'MY': '175.136.0.0/13',
4602 'MZ': '197.218.0.0/15',
4603 'NA': '41.182.0.0/16',
4604 'NC': '101.101.0.0/18',
4605 'NE': '197.214.0.0/18',
4606 'NF': '203.17.240.0/22',
4607 'NG': '105.112.0.0/12',
4608 'NI': '186.76.0.0/15',
4609 'NL': '145.96.0.0/11',
4610 'NO': '84.208.0.0/13',
4611 'NP': '36.252.0.0/15',
4612 'NR': '203.98.224.0/19',
4613 'NU': '49.156.48.0/22',
4614 'NZ': '49.224.0.0/14',
4615 'OM': '5.36.0.0/15',
4616 'PA': '186.72.0.0/15',
4617 'PE': '186.160.0.0/14',
4618 'PF': '123.50.64.0/18',
4619 'PG': '124.240.192.0/19',
4620 'PH': '49.144.0.0/13',
4621 'PK': '39.32.0.0/11',
4622 'PL': '83.0.0.0/11',
4623 'PM': '70.36.0.0/20',
4624 'PR': '66.50.0.0/16',
4625 'PS': '188.161.0.0/16',
4626 'PT': '85.240.0.0/13',
4627 'PW': '202.124.224.0/20',
4628 'PY': '181.120.0.0/14',
4629 'QA': '37.210.0.0/15',
4630 'RE': '102.35.0.0/16',
4631 'RO': '79.112.0.0/13',
4632 'RS': '93.86.0.0/15',
4633 'RU': '5.136.0.0/13',
4634 'RW': '41.186.0.0/16',
4635 'SA': '188.48.0.0/13',
4636 'SB': '202.1.160.0/19',
4637 'SC': '154.192.0.0/11',
4638 'SD': '102.120.0.0/13',
4639 'SE': '78.64.0.0/12',
4640 'SG': '8.128.0.0/10',
4641 'SI': '188.196.0.0/14',
4642 'SK': '78.98.0.0/15',
4643 'SL': '102.143.0.0/17',
4644 'SM': '89.186.32.0/19',
4645 'SN': '41.82.0.0/15',
4646 'SO': '154.115.192.0/18',
4647 'SR': '186.179.128.0/17',
4648 'SS': '105.235.208.0/21',
4649 'ST': '197.159.160.0/19',
4650 'SV': '168.243.0.0/16',
4651 'SX': '190.102.0.0/20',
4652 'SY': '5.0.0.0/16',
4653 'SZ': '41.84.224.0/19',
4654 'TC': '65.255.48.0/20',
4655 'TD': '154.68.128.0/19',
4656 'TG': '196.168.0.0/14',
4657 'TH': '171.96.0.0/13',
4658 'TJ': '85.9.128.0/18',
4659 'TK': '27.96.24.0/21',
4660 'TL': '180.189.160.0/20',
4661 'TM': '95.85.96.0/19',
4662 'TN': '197.0.0.0/11',
4663 'TO': '175.176.144.0/21',
4664 'TR': '78.160.0.0/11',
4665 'TT': '186.44.0.0/15',
4666 'TV': '202.2.96.0/19',
4667 'TW': '120.96.0.0/11',
4668 'TZ': '156.156.0.0/14',
4669 'UA': '37.52.0.0/14',
4670 'UG': '102.80.0.0/13',
4671 'US': '6.0.0.0/8',
4672 'UY': '167.56.0.0/13',
4673 'UZ': '84.54.64.0/18',
4674 'VA': '212.77.0.0/19',
4675 'VC': '207.191.240.0/21',
4676 'VE': '186.88.0.0/13',
4677 'VG': '66.81.192.0/20',
4678 'VI': '146.226.0.0/16',
4679 'VN': '14.160.0.0/11',
4680 'VU': '202.80.32.0/20',
4681 'WF': '117.20.32.0/21',
4682 'WS': '202.4.32.0/19',
4683 'YE': '134.35.0.0/16',
4684 'YT': '41.242.116.0/22',
4685 'ZA': '41.0.0.0/11',
4686 'ZM': '102.144.0.0/13',
4687 'ZW': '102.177.192.0/18',
4688 }
4689
4690 @classmethod
4691 def random_ipv4(cls, code_or_block):
4692 if len(code_or_block) == 2:
4693 block = cls._country_ip_map.get(code_or_block.upper())
4694 if not block:
4695 return None
4696 else:
4697 block = code_or_block
4698 addr, preflen = block.split('/')
4699 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4700 addr_max = addr_min | (0xffffffff >> int(preflen))
4701 return str(socket.inet_ntoa(
4702 struct.pack('!L', random.randint(addr_min, addr_max))))
4703
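# Illustrative example: generate a random IPv4 address that geolocates to a
# country, either by two-letter code or by an explicit CIDR block:
#   >>> GeoUtils.random_ipv4('DE')           # random address in 53.0.0.0/8
#   >>> GeoUtils.random_ipv4('2.16.0.0/13')  # random address in that block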
4704
4705 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4706 def __init__(self, proxies=None):
4707 # Set default handlers
4708 for type in ('http', 'https'):
4709 setattr(self, '%s_open' % type,
4710 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4711 meth(r, proxy, type))
4712 urllib.request.ProxyHandler.__init__(self, proxies)
4713
4714 def proxy_open(self, req, proxy, type):
4715 req_proxy = req.headers.get('Ytdl-request-proxy')
4716 if req_proxy is not None:
4717 proxy = req_proxy
4718 del req.headers['Ytdl-request-proxy']
4719
4720 if proxy == '__noproxy__':
4721 return None # No Proxy
4722 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4723 req.add_header('Ytdl-socks-proxy', proxy)
4724 # yt-dlp's http/https handlers will wrap the socket with SOCKS
4725 return None
4726 return urllib.request.ProxyHandler.proxy_open(
4727 self, req, proxy, type)
4728
4729
4730 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4731 # released into Public Domain
4732 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4733
4734 def long_to_bytes(n, blocksize=0):
4735 """long_to_bytes(n:long, blocksize:int) : string
4736 Convert a long integer to a byte string.
4737
4738 If optional blocksize is given and greater than zero, pad the front of the
4739 byte string with binary zeros so that the length is a multiple of
4740 blocksize.
4741 """
4742 # after much testing, this algorithm was deemed to be the fastest
4743 s = b''
4744 n = int(n)
4745 while n > 0:
4746 s = struct.pack('>I', n & 0xffffffff) + s
4747 n = n >> 32
4748 # strip off leading zeros
4749 for i in range(len(s)):
4750 if s[i] != b'\000'[0]:
4751 break
4752 else:
4753 # only happens when n == 0
4754 s = b'\000'
4755 i = 0
4756 s = s[i:]
4757 # add back some pad bytes. this could be done more efficiently w.r.t. the
4758 # de-padding being done above, but sigh...
4759 if blocksize > 0 and len(s) % blocksize:
4760 s = (blocksize - len(s) % blocksize) * b'\000' + s
4761 return s
4762
4763
4764 def bytes_to_long(s):
4765 """bytes_to_long(string) : long
4766 Convert a byte string to a long integer.
4767
4768 This is (essentially) the inverse of long_to_bytes().
4769 """
4770 acc = 0
4771 length = len(s)
4772 if length % 4:
4773 extra = (4 - length % 4)
4774 s = b'\000' * extra + s
4775 length = length + extra
4776 for i in range(0, length, 4):
4777 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4778 return acc
4779
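# Illustrative round trip:
#   >>> bytes_to_long(b'\x01\x00')
#   256
#   >>> long_to_bytes(256)
#   b'\x01\x00'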
4780
4781 def ohdave_rsa_encrypt(data, exponent, modulus):
4782 '''
4783 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4784
4785 Input:
4786 data: data to encrypt, bytes-like object
4787 exponent, modulus: parameter e and N of RSA algorithm, both integer
4788 Output: hex string of encrypted data
4789
4790 Limitation: supports one block encryption only
4791 '''
4792
4793 payload = int(binascii.hexlify(data[::-1]), 16)
4794 encrypted = pow(payload, exponent, modulus)
4795 return '%x' % encrypted
4796
4797
4798 def pkcs1pad(data, length):
4799 """
4800 Padding input data with PKCS#1 scheme
4801
4802 @param {int[]} data input data
4803 @param {int} length target length
4804 @returns {int[]} padded data
4805 """
4806 if len(data) > length - 11:
4807 raise ValueError('Input data too long for PKCS#1 padding')
4808
4809 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)] # PKCS#1 requires non-zero padding bytes
4810 return [0, 2] + pseudo_random + [0] + data
4811
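# Illustrative example: the result starts with [0, 2], has the payload after
# a zero separator, and is exactly `length` entries long:
#   >>> padded = pkcs1pad([1, 2, 3], 16)
#   >>> padded[:2], padded[-4:], len(padded)
#   ([0, 2], [0, 1, 2, 3], 16)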
4812
4813 def _base_n_table(n, table):
4814 if not table and not n:
4815 raise ValueError('Either table or n must be specified')
4816 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4817
4818 if n and n != len(table):
4819 raise ValueError(f'base {n} exceeds table length {len(table)}')
4820 return table
4821
4822
4823 def encode_base_n(num, n=None, table=None):
4824 """Convert given int to a base-n string"""
4825 table = _base_n_table(n, table)
4826 if not num:
4827 return table[0]
4828
4829 result, base = '', len(table)
4830 while num:
4831 result = table[num % base] + result
4832 num = num // base
4833 return result
4834
4835
4836 def decode_base_n(string, n=None, table=None):
4837 """Convert given base-n string to int"""
4838 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4839 result, base = 0, len(table)
4840 for char in string:
4841 result = result * base + table[char]
4842 return result
4843
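# Illustrative round trip using the default table:
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255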
4844
4845 def decode_base(value, digits):
4846 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4847 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
4848 return decode_base_n(value, table=digits)
4849
4850
4851 def decode_packed_codes(code):
4852 mobj = re.search(PACKED_CODES_RE, code)
4853 obfuscated_code, base, count, symbols = mobj.groups()
4854 base = int(base)
4855 count = int(count)
4856 symbols = symbols.split('|')
4857 symbol_table = {}
4858
4859 while count:
4860 count -= 1
4861 base_n_count = encode_base_n(count, base)
4862 symbol_table[base_n_count] = symbols[count] or base_n_count
4863
4864 return re.sub(
4865 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4866 obfuscated_code)
4867
4868
4869 def caesar(s, alphabet, shift):
4870 if shift == 0:
4871 return s
4872 l = len(alphabet)
4873 return ''.join(
4874 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4875 for c in s)
4876
4877
4878 def rot47(s):
4879 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4880
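# Illustrative example: ROT47 rotates within the 94 printable ASCII
# characters and is its own inverse (47 + 47 = 94):
#   >>> rot47('Hello')
#   'w6==@'
#   >>> rot47(rot47('Hello'))
#   'Hello'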
4881
4882 def parse_m3u8_attributes(attrib):
4883 info = {}
4884 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4885 if val.startswith('"'):
4886 val = val[1:-1]
4887 info[key] = val
4888 return info
4889
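# Illustrative example: quoted values may contain commas; the quotes are stripped:
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}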
4890
4891 def urshift(val, n):
4892 return val >> n if val >= 0 else (val + 0x100000000) >> n
4893
4894
4895 # Based on png2str() written by @gdkchan and improved by @yokrysty
4896 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4897 def decode_png(png_data):
4898 # Reference: https://www.w3.org/TR/PNG/
4899 header = png_data[8:]
4900
4901 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4902 raise OSError('Not a valid PNG file.')
4903
4904 int_map = {1: '>B', 2: '>H', 4: '>I'}
4905 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4906
4907 chunks = []
4908
4909 while header:
4910 length = unpack_integer(header[:4])
4911 header = header[4:]
4912
4913 chunk_type = header[:4]
4914 header = header[4:]
4915
4916 chunk_data = header[:length]
4917 header = header[length:]
4918
4919 header = header[4:] # Skip CRC
4920
4921 chunks.append({
4922 'type': chunk_type,
4923 'length': length,
4924 'data': chunk_data
4925 })
4926
4927 ihdr = chunks[0]['data']
4928
4929 width = unpack_integer(ihdr[:4])
4930 height = unpack_integer(ihdr[4:8])
4931
4932 idat = b''
4933
4934 for chunk in chunks:
4935 if chunk['type'] == b'IDAT':
4936 idat += chunk['data']
4937
4938 if not idat:
4939 raise OSError('Unable to read PNG data.')
4940
4941 decompressed_data = bytearray(zlib.decompress(idat))
4942
4943 stride = width * 3
4944 pixels = []
4945
4946 def _get_pixel(idx):
4947 x = idx % stride
4948 y = idx // stride
4949 return pixels[y][x]
4950
4951 for y in range(height):
4952 basePos = y * (1 + stride)
4953 filter_type = decompressed_data[basePos]
4954
4955 current_row = []
4956
4957 pixels.append(current_row)
4958
4959 for x in range(stride):
4960 color = decompressed_data[1 + basePos + x]
4961 basex = y * stride + x
4962 left = 0
4963 up = 0
4964
4965 if x > 2:
4966 left = _get_pixel(basex - 3)
4967 if y > 0:
4968 up = _get_pixel(basex - stride)
4969
4970 if filter_type == 1: # Sub
4971 color = (color + left) & 0xff
4972 elif filter_type == 2: # Up
4973 color = (color + up) & 0xff
4974 elif filter_type == 3: # Average
4975 color = (color + ((left + up) >> 1)) & 0xff
4976 elif filter_type == 4: # Paeth
4977 a = left
4978 b = up
4979 c = 0
4980
4981 if x > 2 and y > 0:
4982 c = _get_pixel(basex - stride - 3)
4983
4984 p = a + b - c
4985
4986 pa = abs(p - a)
4987 pb = abs(p - b)
4988 pc = abs(p - c)
4989
4990 if pa <= pb and pa <= pc:
4991 color = (color + a) & 0xff
4992 elif pb <= pc:
4993 color = (color + b) & 0xff
4994 else:
4995 color = (color + c) & 0xff
4996
4997 current_row.append(color)
4998
4999 return width, height, pixels
5000
5001
5002 def write_xattr(path, key, value):
5003 # Windows: Write xattrs to NTFS Alternate Data Streams:
5004 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5005 if compat_os_name == 'nt':
5006 assert ':' not in key
5007 assert os.path.exists(path)
5008
5009 try:
5010 with open(f'{path}:{key}', 'wb') as f:
5011 f.write(value)
5012 except OSError as e:
5013 raise XAttrMetadataError(e.errno, e.strerror)
5014 return
5015
5016 # UNIX Method 1. Use xattrs/pyxattrs modules
5017
5018 setxattr = None
5019 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5020 # Unicode arguments are not supported in pyxattr until version 0.5.0
5021 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5022 if version_tuple(xattr.__version__) >= (0, 5, 0):
5023 setxattr = xattr.set
5024 elif xattr:
5025 setxattr = xattr.setxattr
5026
5027 if setxattr:
5028 try:
5029 setxattr(path, key, value)
5030 except OSError as e:
5031 raise XAttrMetadataError(e.errno, e.strerror)
5032 return
5033
5034 # UNIX Method 2. Use setfattr/xattr executables
5035 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5036 else 'xattr' if check_executable('xattr', ['-h']) else None)
5037 if not exe:
5038 raise XAttrUnavailableError(
5039 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5040 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5041
5042 value = value.decode()
5043 try:
5044 _, stderr, returncode = Popen.run(
5045 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5046 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5047 except OSError as e:
5048 raise XAttrMetadataError(e.errno, e.strerror)
5049 if returncode:
5050 raise XAttrMetadataError(returncode, stderr)
5051
5052
5053 def random_birthday(year_field, month_field, day_field):
5054 start_date = datetime.date(1950, 1, 1)
5055 end_date = datetime.date(1995, 12, 31)
5056 offset = random.randint(0, (end_date - start_date).days)
5057 random_date = start_date + datetime.timedelta(offset)
5058 return {
5059 year_field: str(random_date.year),
5060 month_field: str(random_date.month),
5061 day_field: str(random_date.day),
5062 }
5063
5064
5065 # Templates for internet shortcut files, which are plain text files.
5066 DOT_URL_LINK_TEMPLATE = '''\
5067 [InternetShortcut]
5068 URL=%(url)s
5069 '''
5070
5071 DOT_WEBLOC_LINK_TEMPLATE = '''\
5072 <?xml version="1.0" encoding="UTF-8"?>
5073 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5074 <plist version="1.0">
5075 <dict>
5076 \t<key>URL</key>
5077 \t<string>%(url)s</string>
5078 </dict>
5079 </plist>
5080 '''
5081
5082 DOT_DESKTOP_LINK_TEMPLATE = '''\
5083 [Desktop Entry]
5084 Encoding=UTF-8
5085 Name=%(filename)s
5086 Type=Link
5087 URL=%(url)s
5088 Icon=text-html
5089 '''
5090
5091 LINK_TEMPLATES = {
5092 'url': DOT_URL_LINK_TEMPLATE,
5093 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5094 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5095 }
5096
5097
5098 def iri_to_uri(iri):
5099 """
5100 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5101
5102 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it only percent-escapes characters that are not escaped already, using their underlying UTF-8 encoding, so a valid URI passes through unchanged.
5103 """
5104
5105 iri_parts = urllib.parse.urlparse(iri)
5106
5107 if '[' in iri_parts.netloc:
5108 raise ValueError('IPv6 URIs are not yet supported.')
5109 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5110
5111 # The `safe` argument values used by the following code contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5112
5113 net_location = ''
5114 if iri_parts.username:
5115 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5116 if iri_parts.password is not None:
5117 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5118 net_location += '@'
5119
5120 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5121 # The 'idna' encoding produces ASCII text.
5122 if iri_parts.port is not None and iri_parts.port != 80:
5123 net_location += ':' + str(iri_parts.port)
5124
5125 return urllib.parse.urlunparse(
5126 (iri_parts.scheme,
5127 net_location,
5128
5129 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5130
5131 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5132 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5133
5134 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5135 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5136
5137 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5138
5139 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5140
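# Illustrative example (URL is a placeholder): the path is percent-encoded
# from its UTF-8 bytes, while already-escaped sequences are left intact:
#   >>> iri_to_uri('https://example.com/嗨')
#   'https://example.com/%E5%97%A8'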
5141
5142 def to_high_limit_path(path):
5143 if sys.platform in ['win32', 'cygwin']:
5144 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5145 return '\\\\?\\' + os.path.abspath(path)
5146
5147 return path
5148
5149
5150 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5151 val = traverse_obj(obj, *variadic(field))
5152 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5153 return default
5154 return template % func(val)
5155
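# Illustrative examples:
#   >>> format_field({'width': 1920}, 'width', '%dpx')
#   '1920px'
#   >>> format_field({}, 'width', '%dpx', default='unknown')
#   'unknown'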
5156
5157 def clean_podcast_url(url):
5158 return re.sub(r'''(?x)
5159 (?:
5160 (?:
5161 chtbl\.com/track|
5162 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5163 play\.podtrac\.com
5164 )/[^/]+|
5165 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5166 flex\.acast\.com|
5167 pd(?:
5168 cn\.co| # https://podcorn.com/analytics-prefix/
5169 st\.fm # https://podsights.com/docs/
5170 )/e
5171 )/''', '', url)
5172
5173
5174 _HEX_TABLE = '0123456789abcdef'
5175
5176
5177 def random_uuidv4():
5178 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5179
5180
5181 def make_dir(path, to_screen=None):
5182 try:
5183 dn = os.path.dirname(path)
5184 if dn and not os.path.exists(dn):
5185 os.makedirs(dn)
5186 return True
5187 except OSError as err:
5188 if callable(to_screen):
5189 to_screen('unable to create directory ' + error_to_compat_str(err))
5190 return False
5191
5192
5193 def get_executable_path():
5194 from .update import _get_variant_and_executable_path
5195
5196 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5197
5198
5199 def load_plugins(name, suffix, namespace):
5200 classes = {}
5201 with contextlib.suppress(FileNotFoundError):
5202 plugins_spec = importlib.util.spec_from_file_location(
5203 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5204 plugins = importlib.util.module_from_spec(plugins_spec)
5205 sys.modules[plugins_spec.name] = plugins
5206 plugins_spec.loader.exec_module(plugins)
5207 for name in dir(plugins):
5208 if name in namespace:
5209 continue
5210 if not name.endswith(suffix):
5211 continue
5212 klass = getattr(plugins, name)
5213 classes[name] = namespace[name] = klass
5214 return classes
5215
5216
5217 def traverse_obj(
5218 obj, *path_list, default=None, expected_type=None, get_all=True,
5219 casesense=True, is_user_input=False, traverse_string=False):
5220 ''' Traverse nested list/dict/tuple
5221 @param path_list A list of paths which are checked one by one.
5222 Each path is a list of keys where each key is one of:
5223 - None: Do nothing
5224 - string: A dictionary key
5225 - int: An index into a list
5226 - tuple: A list of keys all of which will be traversed
5227 - Ellipsis: Fetch all values in the object
5228 - Function: Takes the key and value as arguments
5229 and returns whether the key matches or not
5230 @param default Default value to return
5231 @param expected_type Only accept final value of this type (Can also be any callable)
5232 @param get_all Return all the values obtained from a path or only the first one
5233 @param casesense Whether to consider dictionary keys as case sensitive
5234 @param is_user_input Whether the keys are generated from user input. If True,
5235 strings are converted to int/slice if necessary
5236 @param traverse_string Whether to traverse inside strings. If True, any
5237 non-compatible object will also be converted into a string
5238 # TODO: Write tests
5239 '''
5240 if not casesense:
5241 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5242 path_list = (map(_lower, variadic(path)) for path in path_list)
5243
5244 def _traverse_obj(obj, path, _current_depth=0):
5245 nonlocal depth
5246 path = tuple(variadic(path))
5247 for i, key in enumerate(path):
5248 if None in (key, obj):
5249 return obj
5250 if isinstance(key, (list, tuple)):
5251 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5252 key = ...
5253 if key is ...:
5254 obj = (obj.values() if isinstance(obj, dict)
5255 else obj if isinstance(obj, (list, tuple, LazyList))
5256 else str(obj) if traverse_string else [])
5257 _current_depth += 1
5258 depth = max(depth, _current_depth)
5259 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5260 elif callable(key):
5261 if isinstance(obj, (list, tuple, LazyList)):
5262 obj = enumerate(obj)
5263 elif isinstance(obj, dict):
5264 obj = obj.items()
5265 else:
5266 if not traverse_string:
5267 return None
5268 obj = str(obj)
5269 _current_depth += 1
5270 depth = max(depth, _current_depth)
5271 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5272 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5273 obj = (obj.get(key) if casesense or (key in obj)
5274 else next((v for k, v in obj.items() if _lower(k) == key), None))
5275 else:
5276 if is_user_input:
5277 key = (int_or_none(key) if ':' not in key
5278 else slice(*map(int_or_none, key.split(':'))))
5279 if key == slice(None):
5280 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5281 if not isinstance(key, (int, slice)):
5282 return None
5283 if not isinstance(obj, (list, tuple, LazyList)):
5284 if not traverse_string:
5285 return None
5286 obj = str(obj)
5287 try:
5288 obj = obj[key]
5289 except IndexError:
5290 return None
5291 return obj
5292
5293 if isinstance(expected_type, type):
5294 type_test = lambda val: val if isinstance(val, expected_type) else None
5295 else:
5296 type_test = expected_type or IDENTITY
5297
5298 for path in path_list:
5299 depth = 0
5300 val = _traverse_obj(obj, path)
5301 if val is not None:
5302 if depth:
5303 for _ in range(depth - 1):
5304 val = itertools.chain.from_iterable(v for v in val if v is not None)
5305 val = [v for v in map(type_test, val) if v is not None]
5306 if val:
5307 return val if get_all else val[0]
5308 else:
5309 val = type_test(val)
5310 if val is not None:
5311 return val
5312 return default
5313
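# Illustrative examples (keys and values are placeholders):
#   >>> d = {'formats': [{'url': 'a'}, {'height': 720, 'url': 'b'}]}
#   >>> traverse_obj(d, ('formats', ..., 'url'))
#   ['a', 'b']
#   >>> traverse_obj(d, ('formats', 0, 'height'), ('formats', 1, 'height'))
#   720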
5314
5315 def traverse_dict(dictn, keys, casesense=True):
5316 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5317 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5318 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5319
5320
5321 def get_first(obj, keys, **kwargs):
5322 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5323
5324
5325 def variadic(x, allowed_types=(str, bytes, dict)):
5326 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5327
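# Illustrative examples: non-iterables and the "allowed" types are wrapped
# in a tuple, other iterables are returned unchanged:
#   >>> variadic('spam')
#   ('spam',)
#   >>> variadic(['spam', 'eggs'])
#   ['spam', 'eggs']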
5328
5329 def time_seconds(**kwargs):
5330 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5331 return t.timestamp()
5332
5333
5334 # Create a JSON Web Signature (JWS) with the HS256 algorithm
5335 # The resulting format is JWS Compact Serialization
5336 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5337 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5338 def jwt_encode_hs256(payload_data, key, headers={}):
5339 header_data = {
5340 'alg': 'HS256',
5341 'typ': 'JWT',
5342 }
5343 if headers:
5344 header_data.update(headers)
5345 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5346 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5347 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5348 signature_b64 = base64.b64encode(h.digest())
5349 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5350 return token
5351
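# Illustrative example (the key is a placeholder). Note that, as written,
# this uses standard rather than URL-safe base64 for the segments:
#   >>> token = jwt_encode_hs256({'sub': 'user'}, 'secret-key')
#   >>> token.count(b'.')  # header.payload.signature
#   2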
5352
5353 # Can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5354 def jwt_decode_hs256(jwt):
5355 header_b64, payload_b64, signature_b64 = jwt.split('.')
5356 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}===')) # re-add any stripped base64 padding; superfluous '='s are ignored
5357 return payload_data
5358
5359
5360 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5361
5362
5363 @functools.cache
5364 def supports_terminal_sequences(stream):
5365 if compat_os_name == 'nt':
5366 if not WINDOWS_VT_MODE:
5367 return False
5368 elif not os.getenv('TERM'):
5369 return False
5370 try:
5371 return stream.isatty()
5372 except BaseException:
5373 return False
5374
5375
5376 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5377 if get_windows_version() < (10, 0, 10586):
5378 return
5379 global WINDOWS_VT_MODE
5380 try:
5381 Popen.run('', shell=True)
5382 except Exception:
5383 return
5384
5385 WINDOWS_VT_MODE = True
5386 supports_terminal_sequences.cache_clear()
5387
5388
5389 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5390
5391
5392 def remove_terminal_sequences(string):
5393 return _terminal_sequences_re.sub('', string)
5394
5395
5396 def number_of_digits(number):
5397 return len('%d' % number)
5398
5399
5400 def join_nonempty(*values, delim='-', from_dict=None):
5401 if from_dict is not None:
5402 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5403 return delim.join(map(str, filter(None, values)))
5404
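# Illustrative example: falsy values are dropped before joining:
#   >>> join_nonempty('mp4', None, 1080, '', delim='-')
#   'mp4-1080'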
5405
5406 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5407 """
5408 Find the largest format dimensions in terms of video width and, for each thumbnail:
5409 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5410 * Update dimensions
5411
5412 This function is useful with video services that scale the provided thumbnails on demand
5413 """
5414 _keys = ('width', 'height')
5415 max_dimensions = max(
5416 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5417 default=(0, 0))
5418 if not max_dimensions[0]:
5419 return thumbnails
5420 return [
5421 merge_dicts(
5422 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5423 dict(zip(_keys, max_dimensions)), thumbnail)
5424 for thumbnail in thumbnails
5425 ]
5426
5427
5428 def parse_http_range(range):
5429 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5430 if not range:
5431 return None, None, None
5432 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5433 if not crg:
5434 return None, None, None
5435 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5436
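# Illustrative examples:
#   >>> parse_http_range('bytes=0-499/1234')
#   (0, 499, 1234)
#   >>> parse_http_range('bytes=500-')
#   (500, None, None)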
5437
5438 def read_stdin(what):
5439 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5440 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5441 return sys.stdin
5442
5443
5444 def determine_file_encoding(data):
5445 """
5446 Detect the text encoding used
5447 @returns (encoding, bytes to skip)
5448 """
5449
5450 # BOM marks are given priority over declarations
5451 for bom, enc in BOMS:
5452 if data.startswith(bom):
5453 return enc, len(bom)
5454
5455 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5456 # We ignore the endianness to get a good enough match
5457 data = data.replace(b'\0', b'')
5458 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5459 return mobj.group(1).decode() if mobj else None, 0
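
# Illustrative usage (a sketch; assumes BOMS above maps the UTF-8 BOM to 'utf-8'):
#   >>> determine_file_encoding(b'\xef\xbb\xbf--no-part')
#   ('utf-8', 3)
#   >>> determine_file_encoding(b'# coding: utf-8\n--proxy 127.0.0.1:3128')
#   ('utf-8', 0)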
5460
5461
5462 class Config:
5463 own_args = None
5464 parsed_args = None
5465 filename = None
5466 __initialized = False
5467
5468 def __init__(self, parser, label=None):
5469 self.parser, self.label = parser, label
5470 self._loaded_paths, self.configs = set(), []
5471
5472 def init(self, args=None, filename=None):
5473 assert not self.__initialized
5474 self.own_args, self.filename = args, filename
5475 return self.load_configs()
5476
5477 def load_configs(self):
5478 directory = ''
5479 if self.filename:
5480 location = os.path.realpath(self.filename)
5481 directory = os.path.dirname(location)
5482 if location in self._loaded_paths:
5483 return False
5484 self._loaded_paths.add(location)
5485
5486 self.__initialized = True
5487 opts, _ = self.parser.parse_known_args(self.own_args)
5488 self.parsed_args = self.own_args
5489 for location in opts.config_locations or []:
5490 if location == '-':
5491 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5492 continue
5493 location = os.path.join(directory, expand_path(location))
5494 if os.path.isdir(location):
5495 location = os.path.join(location, 'yt-dlp.conf')
5496 if not os.path.exists(location):
5497 self.parser.error(f'config location {location} does not exist')
5498 self.append_config(self.read_file(location), location)
5499 return True
5500
5501 def __str__(self):
5502 label = join_nonempty(
5503 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5504 delim=' ')
5505 return join_nonempty(
5506 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5507 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5508 delim='\n')
5509
5510 @staticmethod
5511 def read_file(filename, default=[]):
5512 try:
5513 optionf = open(filename, 'rb')
5514 except OSError:
5515 return default # silently skip if file is not present
5516 try:
5517 enc, skip = determine_file_encoding(optionf.read(512))
5518 optionf.seek(skip, io.SEEK_SET)
5519 except OSError:
5520 enc = None # silently skip read errors
5521 try:
5522 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5523 contents = optionf.read().decode(enc or preferredencoding())
5524 res = shlex.split(contents, comments=True)
5525 except Exception as err:
5526 raise ValueError(f'Unable to parse "{filename}": {err}')
5527 finally:
5528 optionf.close()
5529 return res
5530
5531 @staticmethod
5532 def hide_login_info(opts):
5533 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5534 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5535
5536 def _scrub_eq(o):
5537 m = eqre.match(o)
5538 if m:
5539 return m.group('key') + '=PRIVATE'
5540 else:
5541 return o
5542
5543 opts = list(map(_scrub_eq, opts))
5544 for idx, opt in enumerate(opts):
5545 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5546 opts[idx + 1] = 'PRIVATE'
5547 return opts
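
# Illustrative usage (a sketch): values following private options, and
# `--opt=value` forms, are both scrubbed
#   >>> Config.hide_login_info(['-u', 'name', '--video-password', 'pass', '-v'])
#   ['-u', 'PRIVATE', '--video-password', 'PRIVATE', '-v']
#   >>> Config.hide_login_info(['--username=name'])
#   ['--username=PRIVATE']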
5548
5549 def append_config(self, *args, label=None):
5550 config = type(self)(self.parser, label)
5551 config._loaded_paths = self._loaded_paths
5552 if config.init(*args):
5553 self.configs.append(config)
5554
5555 @property
5556 def all_args(self):
5557 for config in reversed(self.configs):
5558 yield from config.all_args
5559 yield from self.parsed_args or []
5560
5561 def parse_known_args(self, **kwargs):
5562 return self.parser.parse_known_args(self.all_args, **kwargs)
5563
5564 def parse_args(self):
5565 return self.parser.parse_args(self.all_args)
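
# Illustrative usage of Config (a sketch; assumes `parser` exposes an
# optparse-style parse_known_args/parse_args/error interface and defines a
# `config_locations` option):
#   root = Config(parser)
#   root.init(sys.argv[1:])  # also loads any --config-locations files recursively
#   opts, args = root.parse_known_args()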
5566
5567
5568 class WebSocketsWrapper:
5569 """Wraps the websockets module for use in non-async scopes"""
5570 pool = None
5571
5572 def __init__(self, url, headers=None, connect=True):
5573 self.loop = asyncio.new_event_loop()
5574 # XXX: "loop" is deprecated
5575 self.conn = websockets.connect(
5576 url, extra_headers=headers, ping_interval=None,
5577 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5578 if connect:
5579 self.__enter__()
5580 atexit.register(self.__exit__, None, None, None)
5581
5582 def __enter__(self):
5583 if not self.pool:
5584 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5585 return self
5586
5587 def send(self, *args):
5588 self.run_with_loop(self.pool.send(*args), self.loop)
5589
5590 def recv(self, *args):
5591 return self.run_with_loop(self.pool.recv(*args), self.loop)
5592
5593 def __exit__(self, type, value, traceback):
5594 try:
5595 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5596 finally:
5597 self._cancel_all_tasks(self.loop)  # must run before close(); a closed loop cannot cancel tasks
5598 self.loop.close()
5599
5600 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5601 # for contributors: if any new library that uses asyncio needs to be run in a non-async scope, move these functions out of this class
5602 @staticmethod
5603 def run_with_loop(main, loop):
5604 if not asyncio.iscoroutine(main):
5605 raise ValueError(f'a coroutine was expected, got {main!r}')
5606
5607 try:
5608 return loop.run_until_complete(main)
5609 finally:
5610 loop.run_until_complete(loop.shutdown_asyncgens())
5611 if hasattr(loop, 'shutdown_default_executor'):
5612 loop.run_until_complete(loop.shutdown_default_executor())
5613
5614 @staticmethod
5615 def _cancel_all_tasks(loop):
5616 to_cancel = asyncio.all_tasks(loop)
5617
5618 if not to_cancel:
5619 return
5620
5621 for task in to_cancel:
5622 task.cancel()
5623
5624 # XXX: "loop" is removed in python 3.10+
5625 loop.run_until_complete(
5626 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5627
5628 for task in to_cancel:
5629 if task.cancelled():
5630 continue
5631 if task.exception() is not None:
5632 loop.call_exception_handler({
5633 'message': 'unhandled exception during asyncio.run() shutdown',
5634 'exception': task.exception(),
5635 'task': task,
5636 })
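
# Illustrative usage (a sketch; the endpoint is hypothetical):
#   ws = WebSocketsWrapper('wss://example.com/socket', headers={'Origin': 'https://example.com'})
#   ws.send('{"op": "subscribe"}')
#   reply = ws.recv()
#   ws.__exit__(None, None, None)  # also registered via atexit as a fallback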
5637
5638
5639 def merge_headers(*dicts):
5640 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5641 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
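
# Illustrative usage (a sketch): keys are title-cased, so the casing of the
# incoming dicts does not matter
#   >>> merge_headers({'user-agent': 'UA1', 'accept': '*/*'}, {'User-Agent': 'UA2'})
#   {'User-Agent': 'UA2', 'Accept': '*/*'}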
5642
5643
5644 def cached_method(f):
5645 """Cache a method"""
5646 signature = inspect.signature(f)
5647
5648 @functools.wraps(f)
5649 def wrapper(self, *args, **kwargs):
5650 bound_args = signature.bind(self, *args, **kwargs)
5651 bound_args.apply_defaults()
5652 key = tuple(bound_args.arguments.values())
5653
5654 if not hasattr(self, '__cached_method__cache'):
5655 self.__cached_method__cache = {}
5656 cache = self.__cached_method__cache.setdefault(f.__name__, {})
5657 if key not in cache:
5658 cache[key] = f(self, *args, **kwargs)
5659 return cache[key]
5660 return wrapper
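
# Illustrative usage (a sketch; `Fetcher` is a hypothetical class):
#   class Fetcher:
#       @cached_method
#       def get(self, url):
#           ...  # expensive work; runs once per distinct (self, url)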
5661
5662
5663 class classproperty:
5664 """property access for class methods"""
5665
5666 def __init__(self, func):
5667 functools.update_wrapper(self, func)
5668 self.func = func
5669
5670 def __get__(self, _, cls):
5671 return self.func(cls)
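
# Illustrative usage (a sketch):
#   class Foo:
#       @classproperty
#       def name(cls):
#           return cls.__name__
#   # Foo.name == 'Foo'; also works on instances: Foo().name == 'Foo'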
5672
5673
5674 class Namespace(types.SimpleNamespace):
5675 """Immutable namespace"""
5676
5677 def __iter__(self):
5678 return iter(self.__dict__.values())
5679
5680 @property
5681 def items_(self):
5682 return self.__dict__.items()
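
# Illustrative usage (a sketch):
#   >>> ns = Namespace(a=1, b=2)
#   >>> list(ns)
#   [1, 2]
#   >>> dict(ns.items_)
#   {'a': 1, 'b': 2}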
5683
5684
5685 MEDIA_EXTENSIONS = Namespace(
5686 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5687 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5688 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5689 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5690 thumbnails=('jpg', 'png', 'webp'),
5691 storyboards=('mhtml', ),
5692 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5693 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5694 )
5695 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5696 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5697
5698 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5699
5700
5701 class RetryManager:
5702 """Usage:
5703 for retry in RetryManager(...):
5704 try:
5705 ...
5706 except SomeException as err:
5707 retry.error = err
5708 continue
5709 """
5710 attempt, _error = 0, None
5711
5712 def __init__(self, _retries, _error_callback, **kwargs):
5713 self.retries = _retries or 0
5714 self.error_callback = functools.partial(_error_callback, **kwargs)
5715
5716 def _should_retry(self):
5717 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5718
5719 @property
5720 def error(self):
5721 if self._error is NO_DEFAULT:
5722 return None
5723 return self._error
5724
5725 @error.setter
5726 def error(self, value):
5727 self._error = value
5728
5729 def __iter__(self):
5730 while self._should_retry():
5731 self.error = NO_DEFAULT
5732 self.attempt += 1
5733 yield self
5734 if self.error:
5735 self.error_callback(self.error, self.attempt, self.retries)
5736
5737 @staticmethod
5738 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5739 """Utility function for reporting retries"""
5740 if count > retries:
5741 if error:
5742 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5743 raise e
5744
5745 if not count:
5746 return warn(e)
5747 elif isinstance(e, ExtractorError):
5748 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')  # str(None) would be a truthy 'None'
5749 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5750
5751 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5752 if delay:
5753 info(f'Sleeping {delay:.2f} seconds ...')
5754 time.sleep(delay)
5755
5756
5757 def make_archive_id(ie, video_id):
5758 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5759 return f'{ie_key.lower()} {video_id}'
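
# Illustrative usage (a sketch; also accepts an extractor class/instance via its ie_key()):
#   >>> make_archive_id('Youtube', 'dQw4w9WgXcQ')
#   'youtube dQw4w9WgXcQ'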
5760
5761
5762 # Deprecated
5763 has_certifi = bool(certifi)
5764 has_websockets = bool(websockets)