#!/usr/bin/env python3
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import urllib.parse
import xml.etree.ElementTree
import zlib

from .compat import (
    asyncio,
    compat_chr,
    compat_cookiejar,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_http_client,
    compat_HTTPError,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
)
from .dependencies import brotli, certifi, websockets
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
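
# Illustrative usage sketch (the namespace mapping below is an assumed
# example, chosen only to show the expansion):
#   >>> xpath_with_ns('media:song/media:url', {'media': 'http://example.com/ns'})
#   '{http://example.com/ns}song/{http://example.com/ns}url'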


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
412 """Return the content of the tag with the specified attribute in the passed HTML document"""
413 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
414
415
416 def get_elements_html_by_attribute(*args, **kwargs):
417 """Return the html of the tag with the specified attribute in the passed HTML document"""
418 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
419
420
421 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
422 """
423 Return the text (content) and the html (whole) of the tag with the specified
424 attribute in the passed HTML document
425 """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
492 """
493 def find_or_raise(haystack, needle, exc):
494 try:
495 return haystack.index(needle)
496 except ValueError:
497 raise exc
498 closing_tag = f'</{tag}>'
499 whole_start = find_or_raise(
500 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
501 content_start = find_or_raise(
502 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
503 content_start += whole_start + 1
504 with HTMLBreakOnClosingTagParser() as parser:
505 parser.feed(html[whole_start:content_start])
506 if not parser.tagstack or parser.tagstack[0] != tag:
507 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
508 offset = content_start
509 while offset < len(html):
510 next_closing_tag_start = find_or_raise(
511 html[offset:], closing_tag,
512 compat_HTMLParseError(f'closing {tag} tag not found'))
513 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
514 try:
515 parser.feed(html[offset:offset + next_closing_tag_end])
516 offset += next_closing_tag_end
517 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
518 return html[content_start:offset + next_closing_tag_start], \
519 html[whole_start:offset + next_closing_tag_end]
520 raise compat_HTMLParseError('unexpected end of html')
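
# A minimal sketch of this helper's behaviour on a well-formed snippet
# (example input assumed for illustration):
#   >>> get_element_text_and_html_by_tag('div', '<p><div class="a">text</div></p>')
#   ('text', '<div class="a">text</div>')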


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
573 """Given a string for an series of HTML <li> elements,
574 return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
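
# Example of the cleanup steps above on an assumed input: <br> becomes a
# newline, remaining tags are stripped and entities are decoded:
#   >>> clean_html('<b>First</b><br/>second &amp; third')
#   'First\nsecond & third'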


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except LockingUnsupportedError:
                stream = open(filename, open_mode)
            return (stream, filename)
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
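
# A few examples of the substitutions above (inputs assumed for illustration):
#   >>> sanitize_filename('12:34:56')        # timestamps keep their digits
#   '12_34_56'
#   >>> sanitize_filename('a/b|c')           # path separators become '_'
#   'a_b_c'
#   >>> sanitize_filename('Déjà vu?', restricted=True)
#   'Deja_vu'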


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
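
# Example fixups performed above (URLs assumed for illustration):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('rmtpe://example.com/stream')
#   'rtmpe://example.com/stream'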


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
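
# Sketch of how credentials embedded in a URL are stripped and turned into a
# header value (credentials assumed for illustration):
#   >>> extract_basic_auth('http://user:pass@example.com/path')
#   ('http://example.com/path', 'Basic dXNlcjpwYXNz')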


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
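
# Example of entity decoding, covering named, numeric and HTML5 entities
# (input assumed for illustration):
#   >>> unescapeHTML('&amp;&#39;&eacute;')
#   "&'é"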


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
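
# Examples of the formatting above (values assumed for illustration):
#   >>> formatSeconds(61)
#   '1:01'
#   >>> formatSeconds(3661.5, msec=True)
#   '1:01:01.500'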


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()
    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super().__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to work around _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise compat_cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
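
# Example conversion (date assumed for illustration); the extracted offset is
# subtracted so the resulting timestamp is in UTC:
#   >>> parse_iso8601('2022-01-01T12:00:00+01:00')
#   1641034800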


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)


def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600


def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
1747 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1748 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1749 return guess.rstrip('/')
1750 else:
1751 return default_ext
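
# Illustrative behaviour (hypothetical URLs):
#
#   >>> determine_ext('http://example.com/video.mp4?download=1')
#   'mp4'
#   >>> determine_ext('http://example.com/stream')
#   'unknown_video'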
1752
1753
1754 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1755 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1756
1757
1758 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1759 R"""
1760 Return a datetime object from a string.
1761 Supported format:
1762 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1763
1764 @param format strftime format of DATE
1765 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1766 auto: round to the unit provided in date_str (if applicable).
1767 """
1768 auto_precision = False
1769 if precision == 'auto':
1770 auto_precision = True
1771 precision = 'microsecond'
1772 today = datetime_round(datetime.datetime.utcnow(), precision)
1773 if date_str in ('now', 'today'):
1774 return today
1775 if date_str == 'yesterday':
1776 return today - datetime.timedelta(days=1)
1777 match = re.match(
1778 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1779 date_str)
1780 if match is not None:
1781 start_time = datetime_from_str(match.group('start'), precision, format)
1782 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1783 unit = match.group('unit')
1784 if unit == 'month' or unit == 'year':
1785 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1786 unit = 'day'
1787 else:
1788 if unit == 'week':
1789 unit = 'day'
1790 time *= 7
1791 delta = datetime.timedelta(**{unit + 's': time})
1792 new_date = start_time + delta
1793 if auto_precision:
1794 return datetime_round(new_date, unit)
1795 return new_date
1796
1797 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1798
1799
1800 def date_from_str(date_str, format='%Y%m%d', strict=False):
1801 R"""
1802 Return a date object from a string using datetime_from_str
1803
1804 @param strict Restrict allowed patterns to "YYYYMMDD" and
1805 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1806 """
1807 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1808 raise ValueError(f'Invalid date format "{date_str}"')
1809 return datetime_from_str(date_str, precision='microsecond', format=format).date()
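
# Illustrative usage (outputs assume the relative-date grammar described above):
#
#   >>> date_from_str('20220315')
#   datetime.date(2022, 3, 15)
#   >>> date_from_str('now-1week')  # the date one week before today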
1810
1811
1812 def datetime_add_months(dt, months):
1813 """Increment/Decrement a datetime object by months."""
1814 month = dt.month + months - 1
1815 year = dt.year + month // 12
1816 month = month % 12 + 1
1817 day = min(dt.day, calendar.monthrange(year, month)[1])
1818 return dt.replace(year, month, day)
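
# Note how the day is clamped to the length of the target month
# (an illustrative edge case, not from the original source):
#
#   >>> datetime_add_months(datetime.datetime(2020, 1, 31), 1)
#   datetime.datetime(2020, 2, 29, 0, 0)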
1819
1820
1821 def datetime_round(dt, precision='day'):
1822 """
1823 Round a datetime object's time to a specific precision
1824 """
1825 if precision == 'microsecond':
1826 return dt
1827
1828 unit_seconds = {
1829 'day': 86400,
1830 'hour': 3600,
1831 'minute': 60,
1832 'second': 1,
1833 }
1834 roundto = lambda x, n: ((x + n / 2) // n) * n
1835 timestamp = calendar.timegm(dt.timetuple())
1836 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1837
1838
1839 def hyphenate_date(date_str):
1840 """
1841 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1842 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1843 if match is not None:
1844 return '-'.join(match.groups())
1845 else:
1846 return date_str
1847
1848
1849 class DateRange:
1850 """Represents a time interval between two dates"""
1851
1852 def __init__(self, start=None, end=None):
1853 """start and end must be strings in the format accepted by date"""
1854 if start is not None:
1855 self.start = date_from_str(start, strict=True)
1856 else:
1857 self.start = datetime.datetime.min.date()
1858 if end is not None:
1859 self.end = date_from_str(end, strict=True)
1860 else:
1861 self.end = datetime.datetime.max.date()
1862 if self.start > self.end:
1863 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
1864
1865 @classmethod
1866 def day(cls, day):
1867 """Returns a range that only contains the given day"""
1868 return cls(day, day)
1869
1870 def __contains__(self, date):
1871 """Check if the date is in the range"""
1872 if not isinstance(date, datetime.date):
1873 date = date_from_str(date)
1874 return self.start <= date <= self.end
1875
1876 def __str__(self):
1877 return f'{self.start.isoformat()} - {self.end.isoformat()}'
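
# A small usage sketch (illustrative dates):
#
#   >>> '20220115' in DateRange('20220101', '20220131')
#   True
#   >>> str(DateRange.day('20220101'))
#   '2022-01-01 - 2022-01-01'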
1878
1879
1880 def platform_name():
1881 """ Returns the platform name as a compat_str """
1882 res = platform.platform()
1883 if isinstance(res, bytes):
1884 res = res.decode(preferredencoding())
1885
1886 assert isinstance(res, compat_str)
1887 return res
1888
1889
1890 def get_windows_version():
1891 ''' Get Windows version. None if it's not running on Windows '''
1892 if compat_os_name == 'nt':
1893 return version_tuple(platform.win32_ver()[1])
1894 else:
1895 return None
1896
1897
1898 def write_string(s, out=None, encoding=None):
1899 assert isinstance(s, str)
1900 out = out or sys.stderr
1901
1902 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1903 s = re.sub(r'([\r\n]+)', r' \1', s)
1904
1905 if 'b' in getattr(out, 'mode', ''):
1906 byt = s.encode(encoding or preferredencoding(), 'ignore')
1907 out.write(byt)
1908 elif hasattr(out, 'buffer'):
1909 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1910 byt = s.encode(enc, 'ignore')
1911 out.buffer.write(byt)
1912 else:
1913 out.write(s)
1914 out.flush()
1915
1916
1917 def bytes_to_intlist(bs):
1918 if not bs:
1919 return []
1920 if isinstance(bs[0], int): # indexing bytes/bytearray yields ints
1921 return list(bs)
1922 else:
1923 return [ord(c) for c in bs]
1924
1925
1926 def intlist_to_bytes(xs):
1927 if not xs:
1928 return b''
1929 return compat_struct_pack('%dB' % len(xs), *xs)
1930
1931
1932 class LockingUnsupportedError(IOError):
1933 msg = 'File locking is not supported on this platform'
1934
1935 def __init__(self):
1936 super().__init__(self.msg)
1937
1938
1939 # Cross-platform file locking
1940 if sys.platform == 'win32':
1941 import ctypes.wintypes
1942 import msvcrt
1943
1944 class OVERLAPPED(ctypes.Structure):
1945 _fields_ = [
1946 ('Internal', ctypes.wintypes.LPVOID),
1947 ('InternalHigh', ctypes.wintypes.LPVOID),
1948 ('Offset', ctypes.wintypes.DWORD),
1949 ('OffsetHigh', ctypes.wintypes.DWORD),
1950 ('hEvent', ctypes.wintypes.HANDLE),
1951 ]
1952
1953 kernel32 = ctypes.windll.kernel32
1954 LockFileEx = kernel32.LockFileEx
1955 LockFileEx.argtypes = [
1956 ctypes.wintypes.HANDLE, # hFile
1957 ctypes.wintypes.DWORD, # dwFlags
1958 ctypes.wintypes.DWORD, # dwReserved
1959 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1960 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1961 ctypes.POINTER(OVERLAPPED) # Overlapped
1962 ]
1963 LockFileEx.restype = ctypes.wintypes.BOOL
1964 UnlockFileEx = kernel32.UnlockFileEx
1965 UnlockFileEx.argtypes = [
1966 ctypes.wintypes.HANDLE, # hFile
1967 ctypes.wintypes.DWORD, # dwReserved
1968 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1969 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1970 ctypes.POINTER(OVERLAPPED) # Overlapped
1971 ]
1972 UnlockFileEx.restype = ctypes.wintypes.BOOL
1973 whole_low = 0xffffffff
1974 whole_high = 0x7fffffff
1975
1976 def _lock_file(f, exclusive, block):
1977 overlapped = OVERLAPPED()
1978 overlapped.Offset = 0
1979 overlapped.OffsetHigh = 0
1980 overlapped.hEvent = 0
1981 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1982
1983 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1984 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1985 0, whole_low, whole_high, f._lock_file_overlapped_p):
1986 raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
1987
1988 def _unlock_file(f):
1989 assert f._lock_file_overlapped_p
1990 handle = msvcrt.get_osfhandle(f.fileno())
1991 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
1992 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1993
1994 else:
1995 try:
1996 import fcntl
1997
1998 def _lock_file(f, exclusive, block):
1999 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2000 if not block:
2001 flags |= fcntl.LOCK_NB
2002 try:
2003 fcntl.flock(f, flags)
2004 except BlockingIOError:
2005 raise
2006 except OSError: # AOSP does not have flock()
2007 fcntl.lockf(f, flags)
2008
2009 def _unlock_file(f):
2010 try:
2011 fcntl.flock(f, fcntl.LOCK_UN)
2012 except OSError:
2013 fcntl.lockf(f, fcntl.LOCK_UN)
2014
2015 except ImportError:
2016
2017 def _lock_file(f, exclusive, block):
2018 raise LockingUnsupportedError()
2019
2020 def _unlock_file(f):
2021 raise LockingUnsupportedError()
2022
2023
2024 class locked_file:
2025 locked = False
2026
2027 def __init__(self, filename, mode, block=True, encoding=None):
2028 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2029 raise NotImplementedError(mode)
2030 self.mode, self.block = mode, block
2031
2032 writable = any(f in mode for f in 'wax+')
2033 readable = any(f in mode for f in 'r+')
2034 flags = functools.reduce(operator.ior, (
2035 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2036 getattr(os, 'O_BINARY', 0), # Windows only
2037 getattr(os, 'O_NOINHERIT', 0), # Windows only
2038 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2039 os.O_APPEND if 'a' in mode else 0,
2040 os.O_EXCL if 'x' in mode else 0,
2041 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2042 ))
2043
2044 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2045
2046 def __enter__(self):
2047 exclusive = 'r' not in self.mode
2048 try:
2049 _lock_file(self.f, exclusive, self.block)
2050 self.locked = True
2051 except OSError:
2052 self.f.close()
2053 raise
2054 if 'w' in self.mode:
2055 try:
2056 self.f.truncate()
2057 except OSError as e:
2058 if e.errno != errno.ESPIPE: # Illegal seek, expected when self.f is a FIFO
2059 raise e
2060 return self
2061
2062 def unlock(self):
2063 if not self.locked:
2064 return
2065 try:
2066 _unlock_file(self.f)
2067 finally:
2068 self.locked = False
2069
2070 def __exit__(self, *_):
2071 try:
2072 self.unlock()
2073 finally:
2074 self.f.close()
2075
2076 open = __enter__
2077 close = __exit__
2078
2079 def __getattr__(self, attr):
2080 return getattr(self.f, attr)
2081
2082 def __iter__(self):
2083 return iter(self.f)
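
# A minimal usage sketch ('state.txt' is a hypothetical filename); the lock
# is acquired on __enter__ and released, then the file closed, on __exit__:
#
#   with locked_file('state.txt', 'w', block=True) as f:
#       f.write('data')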
2084
2085
2086 def get_filesystem_encoding():
2087 encoding = sys.getfilesystemencoding()
2088 return encoding if encoding is not None else 'utf-8'
2089
2090
2091 def shell_quote(args):
2092 quoted_args = []
2093 encoding = get_filesystem_encoding()
2094 for a in args:
2095 if isinstance(a, bytes):
2096 # We may get a filename encoded with 'encodeFilename'
2097 a = a.decode(encoding)
2098 quoted_args.append(compat_shlex_quote(a))
2099 return ' '.join(quoted_args)
2100
2101
2102 def smuggle_url(url, data):
2103 """ Pass additional data in a URL for internal use. """
2104
2105 url, idata = unsmuggle_url(url, {})
2106 data.update(idata)
2107 sdata = compat_urllib_parse_urlencode(
2108 {'__youtubedl_smuggle': json.dumps(data)})
2109 return url + '#' + sdata
2110
2111
2112 def unsmuggle_url(smug_url, default=None):
2113 if '#__youtubedl_smuggle' not in smug_url:
2114 return smug_url, default
2115 url, _, sdata = smug_url.rpartition('#')
2116 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2117 data = json.loads(jsond)
2118 return url, data
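
# Round-trip sketch (illustrative values): the smuggled payload travels in
# the URL fragment and is recovered verbatim by unsmuggle_url:
#
#   >>> url = smuggle_url('http://example.com/v', {'referer': 'http://example.com'})
#   >>> unsmuggle_url(url)
#   ('http://example.com/v', {'referer': 'http://example.com'})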
2119
2120
2121 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2122 """ Formats numbers with decimal sufixes like K, M, etc """
2123 num, factor = float_or_none(num), float(factor)
2124 if num is None or num < 0:
2125 return None
2126 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2127 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2128 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2129 if factor == 1024:
2130 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2131 converted = num / (factor ** exponent)
2132 return fmt % (converted, suffix)
2133
2134
2135 def format_bytes(bytes):
2136 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
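
# Illustrative outputs (factor=1024 switches to the binary 'Ki'/'Mi' suffixes):
#
#   >>> format_decimal_suffix(12345, '%.1f%s')
#   '12.3k'
#   >>> format_bytes(1536)
#   '1.50KiB'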
2137
2138
2139 def lookup_unit_table(unit_table, s):
2140 units_re = '|'.join(re.escape(u) for u in unit_table)
2141 m = re.match(
2142 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2143 if not m:
2144 return None
2145 num_str = m.group('num').replace(',', '.')
2146 mult = unit_table[m.group('unit')]
2147 return int(float(num_str) * mult)
2148
2149
2150 def parse_filesize(s):
2151 if s is None:
2152 return None
2153
2154 # The lower-case forms are of course incorrect and unofficial,
2155 # but we support those too
2156 _UNIT_TABLE = {
2157 'B': 1,
2158 'b': 1,
2159 'bytes': 1,
2160 'KiB': 1024,
2161 'KB': 1000,
2162 'kB': 1024,
2163 'Kb': 1000,
2164 'kb': 1000,
2165 'kilobytes': 1000,
2166 'kibibytes': 1024,
2167 'MiB': 1024 ** 2,
2168 'MB': 1000 ** 2,
2169 'mB': 1024 ** 2,
2170 'Mb': 1000 ** 2,
2171 'mb': 1000 ** 2,
2172 'megabytes': 1000 ** 2,
2173 'mebibytes': 1024 ** 2,
2174 'GiB': 1024 ** 3,
2175 'GB': 1000 ** 3,
2176 'gB': 1024 ** 3,
2177 'Gb': 1000 ** 3,
2178 'gb': 1000 ** 3,
2179 'gigabytes': 1000 ** 3,
2180 'gibibytes': 1024 ** 3,
2181 'TiB': 1024 ** 4,
2182 'TB': 1000 ** 4,
2183 'tB': 1024 ** 4,
2184 'Tb': 1000 ** 4,
2185 'tb': 1000 ** 4,
2186 'terabytes': 1000 ** 4,
2187 'tebibytes': 1024 ** 4,
2188 'PiB': 1024 ** 5,
2189 'PB': 1000 ** 5,
2190 'pB': 1024 ** 5,
2191 'Pb': 1000 ** 5,
2192 'pb': 1000 ** 5,
2193 'petabytes': 1000 ** 5,
2194 'pebibytes': 1024 ** 5,
2195 'EiB': 1024 ** 6,
2196 'EB': 1000 ** 6,
2197 'eB': 1024 ** 6,
2198 'Eb': 1000 ** 6,
2199 'eb': 1000 ** 6,
2200 'exabytes': 1000 ** 6,
2201 'exbibytes': 1024 ** 6,
2202 'ZiB': 1024 ** 7,
2203 'ZB': 1000 ** 7,
2204 'zB': 1024 ** 7,
2205 'Zb': 1000 ** 7,
2206 'zb': 1000 ** 7,
2207 'zettabytes': 1000 ** 7,
2208 'zebibytes': 1024 ** 7,
2209 'YiB': 1024 ** 8,
2210 'YB': 1000 ** 8,
2211 'yB': 1024 ** 8,
2212 'Yb': 1000 ** 8,
2213 'yb': 1000 ** 8,
2214 'yottabytes': 1000 ** 8,
2215 'yobibytes': 1024 ** 8,
2216 }
2217
2218 return lookup_unit_table(_UNIT_TABLE, s)
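
# Illustrative values, distinguishing decimal and binary units:
#
#   >>> parse_filesize('10 MB')
#   10000000
#   >>> parse_filesize('1.5 GiB')
#   1610612736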
2219
2220
2221 def parse_count(s):
2222 if s is None:
2223 return None
2224
2225 s = re.sub(r'^[^\d]+\s', '', s).strip()
2226
2227 if re.match(r'^[\d,.]+$', s):
2228 return str_to_int(s)
2229
2230 _UNIT_TABLE = {
2231 'k': 1000,
2232 'K': 1000,
2233 'm': 1000 ** 2,
2234 'M': 1000 ** 2,
2235 'kk': 1000 ** 2,
2236 'KK': 1000 ** 2,
2237 'b': 1000 ** 3,
2238 'B': 1000 ** 3,
2239 }
2240
2241 ret = lookup_unit_table(_UNIT_TABLE, s)
2242 if ret is not None:
2243 return ret
2244
2245 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2246 if mobj:
2247 return str_to_int(mobj.group(1))
2248
2249
2250 def parse_resolution(s, *, lenient=False):
2251 if s is None:
2252 return {}
2253
2254 if lenient:
2255 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2256 else:
2257 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2258 if mobj:
2259 return {
2260 'width': int(mobj.group('w')),
2261 'height': int(mobj.group('h')),
2262 }
2263
2264 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2265 if mobj:
2266 return {'height': int(mobj.group(1))}
2267
2268 mobj = re.search(r'\b([48])[kK]\b', s)
2269 if mobj:
2270 return {'height': int(mobj.group(1)) * 540}
2271
2272 return {}
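
# Illustrative inputs:
#
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}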
2273
2274
2275 def parse_bitrate(s):
2276 if not isinstance(s, compat_str):
2277 return
2278 mobj = re.search(r'\b(\d+)\s*kbps', s)
2279 if mobj:
2280 return int(mobj.group(1))
2281
2282
2283 def month_by_name(name, lang='en'):
2284 """ Return the number of a month by (locale-independently) English name """
2285
2286 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2287
2288 try:
2289 return month_names.index(name) + 1
2290 except ValueError:
2291 return None
2292
2293
2294 def month_by_abbreviation(abbrev):
2295 """ Return the number of a month by (locale-independently) English
2296 abbreviations """
2297
2298 try:
2299 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2300 except ValueError:
2301 return None
2302
2303
2304 def fix_xml_ampersands(xml_str):
2305 """Replace all the '&' by '&amp;' in XML"""
2306 return re.sub(
2307 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2308 '&amp;',
2309 xml_str)
2310
2311
2312 def setproctitle(title):
2313 assert isinstance(title, compat_str)
2314
2315 # ctypes in Jython is not complete
2316 # http://bugs.jython.org/issue2148
2317 if sys.platform.startswith('java'):
2318 return
2319
2320 try:
2321 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2322 except OSError:
2323 return
2324 except TypeError:
2325 # LoadLibrary in Windows Python 2.7.13 only expects
2326 # a bytestring, but since unicode_literals turns
2327 # every string into a unicode string, it fails.
2328 return
2329 title_bytes = title.encode()
2330 buf = ctypes.create_string_buffer(len(title_bytes))
2331 buf.value = title_bytes
2332 try:
2333 libc.prctl(15, buf, 0, 0, 0)
2334 except AttributeError:
2335 return # Strange libc, just skip this
2336
2337
2338 def remove_start(s, start):
2339 return s[len(start):] if s is not None and s.startswith(start) else s
2340
2341
2342 def remove_end(s, end):
2343 return s[:-len(end)] if s is not None and s.endswith(end) else s
2344
2345
2346 def remove_quotes(s):
2347 if s is None or len(s) < 2:
2348 return s
2349 for quote in ('"', "'", ):
2350 if s[0] == quote and s[-1] == quote:
2351 return s[1:-1]
2352 return s
2353
2354
2355 def get_domain(url):
2356 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2357 return domain.group('domain') if domain else None
2358
2359
2360 def url_basename(url):
2361 path = compat_urlparse.urlparse(url).path
2362 return path.strip('/').split('/')[-1]
2363
2364
2365 def base_url(url):
2366 return re.match(r'https?://[^?#&]+/', url).group()
2367
2368
2369 def urljoin(base, path):
2370 if isinstance(path, bytes):
2371 path = path.decode()
2372 if not isinstance(path, compat_str) or not path:
2373 return None
2374 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2375 return path
2376 if isinstance(base, bytes):
2377 base = base.decode()
2378 if not isinstance(base, compat_str) or not re.match(
2379 r'^(?:https?:)?//', base):
2380 return None
2381 return compat_urlparse.urljoin(base, path)
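
# Illustrative behaviour (hypothetical URLs); absolute and protocol-relative
# URLs are returned as-is, other paths are resolved against the base:
#
#   >>> urljoin('https://example.com/a/b', 'c/d')
#   'https://example.com/a/c/d'
#   >>> urljoin('https://example.com/a/b', '/c')
#   'https://example.com/c'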
2382
2383
2384 class HEADRequest(compat_urllib_request.Request):
2385 def get_method(self):
2386 return 'HEAD'
2387
2388
2389 class PUTRequest(compat_urllib_request.Request):
2390 def get_method(self):
2391 return 'PUT'
2392
2393
2394 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2395 if get_attr and v is not None:
2396 v = getattr(v, get_attr, None)
2397 try:
2398 return int(v) * invscale // scale
2399 except (ValueError, TypeError, OverflowError):
2400 return default
2401
2402
2403 def str_or_none(v, default=None):
2404 return default if v is None else compat_str(v)
2405
2406
2407 def str_to_int(int_str):
2408 """ A more relaxed version of int_or_none """
2409 if isinstance(int_str, int):
2410 return int_str
2411 elif isinstance(int_str, compat_str):
2412 int_str = re.sub(r'[,\.\+]', '', int_str)
2413 return int_or_none(int_str)
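
# Illustrative values; separators and '+' are simply stripped:
#
#   >>> str_to_int('123,456')
#   123456
#   >>> str_to_int('523.87')
#   52387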
2414
2415
2416 def float_or_none(v, scale=1, invscale=1, default=None):
2417 if v is None:
2418 return default
2419 try:
2420 return float(v) * invscale / scale
2421 except (ValueError, TypeError):
2422 return default
2423
2424
2425 def bool_or_none(v, default=None):
2426 return v if isinstance(v, bool) else default
2427
2428
2429 def strip_or_none(v, default=None):
2430 return v.strip() if isinstance(v, compat_str) else default
2431
2432
2433 def url_or_none(url):
2434 if not url or not isinstance(url, compat_str):
2435 return None
2436 url = url.strip()
2437 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2438
2439
2440 def request_to_url(req):
2441 if isinstance(req, compat_urllib_request.Request):
2442 return req.get_full_url()
2443 else:
2444 return req
2445
2446
2447 def strftime_or_none(timestamp, date_format, default=None):
2448 datetime_object = None
2449 try:
2450 if isinstance(timestamp, (int, float)): # unix timestamp
2451 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2452 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2453 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2454 return datetime_object.strftime(date_format)
2455 except (ValueError, TypeError, AttributeError):
2456 return default
2457
2458
2459 def parse_duration(s):
2460 if not isinstance(s, str):
2461 return None
2462 s = s.strip()
2463 if not s:
2464 return None
2465
2466 days, hours, mins, secs, ms = [None] * 5
2467 m = re.match(r'''(?x)
2468 (?P<before_secs>
2469 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2470 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2471 (?P<ms>[.:][0-9]+)?Z?$
2472 ''', s)
2473 if m:
2474 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2475 else:
2476 m = re.match(
2477 r'''(?ix)(?:P?
2478 (?:
2479 [0-9]+\s*y(?:ears?)?,?\s*
2480 )?
2481 (?:
2482 [0-9]+\s*m(?:onths?)?,?\s*
2483 )?
2484 (?:
2485 [0-9]+\s*w(?:eeks?)?,?\s*
2486 )?
2487 (?:
2488 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2489 )?
2490 T)?
2491 (?:
2492 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2493 )?
2494 (?:
2495 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2496 )?
2497 (?:
2498 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2499 )?Z?$''', s)
2500 if m:
2501 days, hours, mins, secs, ms = m.groups()
2502 else:
2503 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2504 if m:
2505 hours, mins = m.groups()
2506 else:
2507 return None
2508
2509 if ms:
2510 ms = ms.replace(':', '.')
2511 return sum(float(part or 0) * mult for part, mult in (
2512 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
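
# Illustrative inputs covering the three recognized shapes
# (clock-style, ISO 8601-like, and free-text):
#
#   >>> parse_duration('1:23:45')
#   5025.0
#   >>> parse_duration('PT1H30M')
#   5400.0
#   >>> parse_duration('10 min')
#   600.0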
2513
2514
2515 def prepend_extension(filename, ext, expected_real_ext=None):
2516 name, real_ext = os.path.splitext(filename)
2517 return (
2518 f'{name}.{ext}{real_ext}'
2519 if not expected_real_ext or real_ext[1:] == expected_real_ext
2520 else f'{filename}.{ext}')
2521
2522
2523 def replace_extension(filename, ext, expected_real_ext=None):
2524 name, real_ext = os.path.splitext(filename)
2525 return '{}.{}'.format(
2526 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2527 ext)
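
# Illustrative filenames:
#
#   >>> prepend_extension('video.mp4', 'temp')
#   'video.temp.mp4'
#   >>> replace_extension('video.webm', 'mkv')
#   'video.mkv'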
2528
2529
2530 def check_executable(exe, args=[]):
2531 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2532 args can be a list of arguments for a short output (like -version) """
2533 try:
2534 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2535 except OSError:
2536 return False
2537 return exe
2538
2539
2540 def _get_exe_version_output(exe, args, *, to_screen=None):
2541 if to_screen:
2542 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2543 try:
2544 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2545 # SIGTTOU if yt-dlp is run in the background.
2546 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2547 out, _ = Popen(
2548 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2549 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2550 except OSError:
2551 return False
2552 if isinstance(out, bytes): # Popen returns bytes when text mode is not enabled
2553 out = out.decode('ascii', 'ignore')
2554 return out
2555
2556
2557 def detect_exe_version(output, version_re=None, unrecognized='present'):
2558 assert isinstance(output, compat_str)
2559 if version_re is None:
2560 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2561 m = re.search(version_re, output)
2562 if m:
2563 return m.group(1)
2564 else:
2565 return unrecognized
2566
2567
2568 def get_exe_version(exe, args=['--version'],
2569 version_re=None, unrecognized='present'):
2570 """ Returns the version of the specified executable,
2571 or False if the executable is not present """
2572 out = _get_exe_version_output(exe, args)
2573 return detect_exe_version(out, version_re, unrecognized) if out else False
2574
2575
2576 class LazyList(collections.abc.Sequence):
2577 """Lazy immutable list from an iterable
2578 Note that slices of a LazyList are lists and not LazyList"""
2579
2580 class IndexError(IndexError):
2581 pass
2582
2583 def __init__(self, iterable, *, reverse=False, _cache=None):
2584 self._iterable = iter(iterable)
2585 self._cache = [] if _cache is None else _cache
2586 self._reversed = reverse
2587
2588 def __iter__(self):
2589 if self._reversed:
2590 # We need to consume the entire iterable to iterate in reverse
2591 yield from self.exhaust()
2592 return
2593 yield from self._cache
2594 for item in self._iterable:
2595 self._cache.append(item)
2596 yield item
2597
2598 def _exhaust(self):
2599 self._cache.extend(self._iterable)
2600 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2601 return self._cache
2602
2603 def exhaust(self):
2604 """Evaluate the entire iterable"""
2605 return self._exhaust()[::-1 if self._reversed else 1]
2606
2607 @staticmethod
2608 def _reverse_index(x):
2609 return None if x is None else -(x + 1)
2610
2611 def __getitem__(self, idx):
2612 if isinstance(idx, slice):
2613 if self._reversed:
2614 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2615 start, stop, step = idx.start, idx.stop, idx.step or 1
2616 elif isinstance(idx, int):
2617 if self._reversed:
2618 idx = self._reverse_index(idx)
2619 start, stop, step = idx, idx, 0
2620 else:
2621 raise TypeError('indices must be integers or slices')
2622 if ((start or 0) < 0 or (stop or 0) < 0
2623 or (start is None and step < 0)
2624 or (stop is None and step > 0)):
2625 # We need to consume the entire iterable to be able to slice from the end
2626 # Obviously, never use this with infinite iterables
2627 self._exhaust()
2628 try:
2629 return self._cache[idx]
2630 except IndexError as e:
2631 raise self.IndexError(e) from e
2632 n = max(start or 0, stop or 0) - len(self._cache) + 1
2633 if n > 0:
2634 self._cache.extend(itertools.islice(self._iterable, n))
2635 try:
2636 return self._cache[idx]
2637 except IndexError as e:
2638 raise self.IndexError(e) from e
2639
2640 def __bool__(self):
2641 try:
2642 self[-1] if self._reversed else self[0]
2643 except self.IndexError:
2644 return False
2645 return True
2646
2647 def __len__(self):
2648 self._exhaust()
2649 return len(self._cache)
2650
2651 def __reversed__(self):
2652 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2653
2654 def __copy__(self):
2655 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2656
2657 def __repr__(self):
2658 # repr and str should mimic a list. So we exhaust the iterable
2659 return repr(self.exhaust())
2660
2661 def __str__(self):
2662 return repr(self.exhaust())
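
# A small usage sketch: items are pulled from the iterable only as needed,
# so even infinite iterators can be wrapped (never reverse or len() those):
#
#   >>> lst = LazyList(itertools.count())
#   >>> lst[5]  # consumes only the first six items
#   5
#   >>> list(LazyList(range(4), reverse=True))
#   [3, 2, 1, 0]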
2663
2664
2665 class PagedList:
2666
2667 class IndexError(IndexError):
2668 pass
2669
2670 def __len__(self):
2671 # This is only useful for tests
2672 return len(self.getslice())
2673
2674 def __init__(self, pagefunc, pagesize, use_cache=True):
2675 self._pagefunc = pagefunc
2676 self._pagesize = pagesize
2677 self._pagecount = float('inf')
2678 self._use_cache = use_cache
2679 self._cache = {}
2680
2681 def getpage(self, pagenum):
2682 page_results = self._cache.get(pagenum)
2683 if page_results is None:
2684 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2685 if self._use_cache:
2686 self._cache[pagenum] = page_results
2687 return page_results
2688
2689 def getslice(self, start=0, end=None):
2690 return list(self._getslice(start, end))
2691
2692 def _getslice(self, start, end):
2693 raise NotImplementedError('This method must be implemented by subclasses')
2694
2695 def __getitem__(self, idx):
2696 assert self._use_cache, 'Indexing PagedList requires cache'
2697 if not isinstance(idx, int) or idx < 0:
2698 raise TypeError('indices must be non-negative integers')
2699 entries = self.getslice(idx, idx + 1)
2700 if not entries:
2701 raise self.IndexError()
2702 return entries[0]
2703
2704
2705 class OnDemandPagedList(PagedList):
2706 """Download pages until a page with less than maximum results"""
2707
2708 def _getslice(self, start, end):
2709 for pagenum in itertools.count(start // self._pagesize):
2710 firstid = pagenum * self._pagesize
2711 nextfirstid = pagenum * self._pagesize + self._pagesize
2712 if start >= nextfirstid:
2713 continue
2714
2715 startv = (
2716 start % self._pagesize
2717 if firstid <= start < nextfirstid
2718 else 0)
2719 endv = (
2720 ((end - 1) % self._pagesize) + 1
2721 if (end is not None and firstid <= end <= nextfirstid)
2722 else None)
2723
2724 try:
2725 page_results = self.getpage(pagenum)
2726 except Exception:
2727 self._pagecount = pagenum - 1
2728 raise
2729 if startv != 0 or endv is not None:
2730 page_results = page_results[startv:endv]
2731 yield from page_results
2732
2733 # A little optimization - if the current page is not "full", i.e. does
2734 # not contain page_size videos, then we can assume that this page
2735 # is the last one - there are no more ids on further pages,
2736 # so there is no need to query again.
2737 if len(page_results) + startv < self._pagesize:
2738 break
2739
2740 # If we got the whole page, but the next page is not interesting,
2741 # break out early as well
2742 if end == nextfirstid:
2743 break
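
# A minimal sketch with a hypothetical pagefunc serving pages of three items:
#
#   >>> pages = OnDemandPagedList(lambda n: list(range(n * 3, n * 3 + 3)), 3)
#   >>> pages.getslice(2, 7)
#   [2, 3, 4, 5, 6]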
2744
2745
2746 class InAdvancePagedList(PagedList):
2747 """PagedList with total number of pages known in advance"""
2748
2749 def __init__(self, pagefunc, pagecount, pagesize):
2750 PagedList.__init__(self, pagefunc, pagesize, True)
2751 self._pagecount = pagecount
2752
2753 def _getslice(self, start, end):
2754 start_page = start // self._pagesize
2755 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2756 skip_elems = start - start_page * self._pagesize
2757 only_more = None if end is None else end - start
2758 for pagenum in range(start_page, end_page):
2759 page_results = self.getpage(pagenum)
2760 if skip_elems:
2761 page_results = page_results[skip_elems:]
2762 skip_elems = None
2763 if only_more is not None:
2764 if len(page_results) < only_more:
2765 only_more -= len(page_results)
2766 else:
2767 yield from page_results[:only_more]
2768 break
2769 yield from page_results
2770
2771
2772 def uppercase_escape(s):
2773 unicode_escape = codecs.getdecoder('unicode_escape')
2774 return re.sub(
2775 r'\\U[0-9a-fA-F]{8}',
2776 lambda m: unicode_escape(m.group(0))[0],
2777 s)
2778
2779
2780 def lowercase_escape(s):
2781 unicode_escape = codecs.getdecoder('unicode_escape')
2782 return re.sub(
2783 r'\\u[0-9a-fA-F]{4}',
2784 lambda m: unicode_escape(m.group(0))[0],
2785 s)
2786
2787
2788 def escape_rfc3986(s):
2789 """Escape non-ASCII characters as suggested by RFC 3986"""
2790 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2791
2792
2793 def escape_url(url):
2794 """Escape URL as suggested by RFC 3986"""
2795 url_parsed = compat_urllib_parse_urlparse(url)
2796 return url_parsed._replace(
2797 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2798 path=escape_rfc3986(url_parsed.path),
2799 params=escape_rfc3986(url_parsed.params),
2800 query=escape_rfc3986(url_parsed.query),
2801 fragment=escape_rfc3986(url_parsed.fragment)
2802 ).geturl()
2803
2804
2805 def parse_qs(url):
2806 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2807
2808
2809 def read_batch_urls(batch_fd):
2810 def fixup(url):
2811 if not isinstance(url, compat_str):
2812 url = url.decode('utf-8', 'replace')
2813 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2814 for bom in BOM_UTF8:
2815 if url.startswith(bom):
2816 url = url[len(bom):]
2817 url = url.lstrip()
2818 if not url or url.startswith(('#', ';', ']')):
2819 return False
2820 # "#" cannot be stripped out since it is part of the URI
2821 # However, it can be safely stripped out if following a whitespace
2822 return re.split(r'\s#', url, 1)[0].rstrip()
2823
2824 with contextlib.closing(batch_fd) as fd:
2825 return [url for url in map(fixup, fd) if url]
2826
2827
2828 def urlencode_postdata(*args, **kargs):
2829 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2830
2831
2832 def update_url_query(url, query):
2833 if not query:
2834 return url
2835 parsed_url = compat_urlparse.urlparse(url)
2836 qs = compat_parse_qs(parsed_url.query)
2837 qs.update(query)
2838 return compat_urlparse.urlunparse(parsed_url._replace(
2839 query=compat_urllib_parse_urlencode(qs, True)))
2840
2841
2842 def update_Request(req, url=None, data=None, headers={}, query={}):
2843 req_headers = req.headers.copy()
2844 req_headers.update(headers)
2845 req_data = data or req.data
2846 req_url = update_url_query(url or req.get_full_url(), query)
2847 req_get_method = req.get_method()
2848 if req_get_method == 'HEAD':
2849 req_type = HEADRequest
2850 elif req_get_method == 'PUT':
2851 req_type = PUTRequest
2852 else:
2853 req_type = compat_urllib_request.Request
2854 new_req = req_type(
2855 req_url, data=req_data, headers=req_headers,
2856 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2857 if hasattr(req, 'timeout'):
2858 new_req.timeout = req.timeout
2859 return new_req
2860
2861
2862 def _multipart_encode_impl(data, boundary):
2863 content_type = 'multipart/form-data; boundary=%s' % boundary
2864
2865 out = b''
2866 for k, v in data.items():
2867 out += b'--' + boundary.encode('ascii') + b'\r\n'
2868 if isinstance(k, compat_str):
2869 k = k.encode()
2870 if isinstance(v, compat_str):
2871 v = v.encode()
2872 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2873 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2874 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2875 if boundary.encode('ascii') in content:
2876 raise ValueError('Boundary overlaps with data')
2877 out += content
2878
2879 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2880
2881 return out, content_type
2882
2883
2884 def multipart_encode(data, boundary=None):
2885 '''
2886 Encode a dict to RFC 7578-compliant form-data
2887
2888 data:
2889 A dict where keys and values can be either Unicode or bytes-like
2890 objects.
2891 boundary:
2892 If specified, it must be a Unicode object and is used as the boundary.
2893 Otherwise, a random boundary is generated.
2894
2895 Reference: https://tools.ietf.org/html/rfc7578
2896 '''
2897 has_specified_boundary = boundary is not None
2898
2899 while True:
2900 if boundary is None:
2901 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2902
2903 try:
2904 out, content_type = _multipart_encode_impl(data, boundary)
2905 break
2906 except ValueError:
2907 if has_specified_boundary:
2908 raise
2909 boundary = None
2910
2911 return out, content_type
2912
2913
2914 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2915 for val in map(d.get, variadic(key_or_keys)):
2916 if val is not None and (val or not skip_false_values):
2917 return val
2918 return default
2919
2920
2921 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2922 for f in funcs:
2923 try:
2924 val = f(*args, **kwargs)
2925 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
2926 pass
2927 else:
2928 if expected_type is None or isinstance(val, expected_type):
2929 return val
2930
2931
2932 def try_get(src, getter, expected_type=None):
2933 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2934
2935
2936 def filter_dict(dct, cndn=lambda _, v: v is not None):
2937 return {k: v for k, v in dct.items() if cndn(k, v)}
2938
2939
2940 def merge_dicts(*dicts):
2941 merged = {}
2942 for a_dict in dicts:
2943 for k, v in a_dict.items():
2944 if (v is not None and k not in merged
2945 or isinstance(v, str) and merged[k] == ''):
2946 merged[k] = v
2947 return merged
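
# Illustrative merge: earlier dicts win, except that an empty string
# may be overwritten by a string value from a later dict:
#
#   >>> merge_dicts({'a': 1, 'b': ''}, {'a': 2, 'b': 'x', 'c': 3})
#   {'a': 1, 'b': 'x', 'c': 3}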
2948
2949
2950 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2951 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2952
2953
2954 US_RATINGS = {
2955 'G': 0,
2956 'PG': 10,
2957 'PG-13': 13,
2958 'R': 16,
2959 'NC': 18,
2960 }
2961
2962
2963 TV_PARENTAL_GUIDELINES = {
2964 'TV-Y': 0,
2965 'TV-Y7': 7,
2966 'TV-G': 0,
2967 'TV-PG': 0,
2968 'TV-14': 14,
2969 'TV-MA': 17,
2970 }
2971
2972
2973 def parse_age_limit(s):
2974 # isinstance(False, int) is True. So type() must be used instead
2975 if type(s) is int:
2976 return s if 0 <= s <= 21 else None
2977 elif not isinstance(s, str):
2978 return None
2979 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2980 if m:
2981 return int(m.group('age'))
2982 s = s.upper()
2983 if s in US_RATINGS:
2984 return US_RATINGS[s]
2985 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
2986 if m:
2987 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
2988 return None
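
# Illustrative inputs:
#
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17
#   >>> parse_age_limit(18)
#   18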
2989
2990
2991 def strip_jsonp(code):
2992 return re.sub(
2993 r'''(?sx)^
2994 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2995 (?:\s*&&\s*(?P=func_name))?
2996 \s*\(\s*(?P<callback_data>.*)\);?
2997 \s*?(?://[^\n]*)*$''',
2998 r'\g<callback_data>', code)
2999
3000
3001 def js_to_json(code, vars={}):
3002 # vars is a dict of var, val pairs to substitute
3003 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3004 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3005 INTEGER_TABLE = (
3006 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3007 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3008 )
3009
3010 def fix_kv(m):
3011 v = m.group(0)
3012 if v in ('true', 'false', 'null'):
3013 return v
3014 elif v in ('undefined', 'void 0'):
3015 return 'null'
3016 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3017 return ""
3018
3019 if v[0] in ("'", '"'):
3020 v = re.sub(r'(?s)\\.|"', lambda m: {
3021 '"': '\\"',
3022 "\\'": "'",
3023 '\\\n': '',
3024 '\\x': '\\u00',
3025 }.get(m.group(0), m.group(0)), v[1:-1])
3026 else:
3027 for regex, base in INTEGER_TABLE:
3028 im = re.match(regex, v)
3029 if im:
3030 i = int(im.group(1), base)
3031 return '"%d":' % i if v.endswith(':') else '%d' % i
3032
3033 if v in vars:
3034 return vars[v]
3035
3036 return '"%s"' % v
3037
3038 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3039
3040 return re.sub(r'''(?sx)
3041 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3042 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3043 {comment}|,(?={skip}[\]}}])|
3044 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3045 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3046 [0-9]+(?={skip}:)|
3047 !+
3048 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
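
# A small sketch (illustrative object literal): identifiers are quoted,
# single-quoted strings are re-quoted, and hex/octal integers are decimalized:
#
#   >>> js_to_json("{abc: 'def', x: 0x10}")
#   '{"abc": "def", "x": 16}'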
3049
3050
3051 def qualities(quality_ids):
3052 """ Get a numeric quality value out of a list of possible values """
3053 def q(qid):
3054 try:
3055 return quality_ids.index(qid)
3056 except ValueError:
3057 return -1
3058 return q
3059
3060
3061 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
3062
3063
3064 DEFAULT_OUTTMPL = {
3065 'default': '%(title)s [%(id)s].%(ext)s',
3066 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3067 }
3068 OUTTMPL_TYPES = {
3069 'chapter': None,
3070 'subtitle': None,
3071 'thumbnail': None,
3072 'description': 'description',
3073 'annotation': 'annotations.xml',
3074 'infojson': 'info.json',
3075 'link': None,
3076 'pl_video': None,
3077 'pl_thumbnail': None,
3078 'pl_description': 'description',
3079 'pl_infojson': 'info.json',
3080 }
3081
3082 # As of [1], the format syntax is:
3083 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3084 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3085 STR_FORMAT_RE_TMPL = r'''(?x)
3086 (?<!%)(?P<prefix>(?:%%)*)
3087 %
3088 (?P<has_key>\((?P<key>{0})\))?
3089 (?P<format>
3090 (?P<conversion>[#0\-+ ]+)?
3091 (?P<min_width>\d+)?
3092 (?P<precision>\.\d+)?
3093 (?P<len_mod>[hlL])? # unused in python
3094 {1} # conversion type
3095 )
3096 '''
3097
3098
3099 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3100
3101
3102 def limit_length(s, length):
3103 """ Add ellipses to overly long strings """
3104 if s is None:
3105 return None
3106 ELLIPSES = '...'
3107 if len(s) > length:
3108 return s[:length - len(ELLIPSES)] + ELLIPSES
3109 return s
3110
3111
3112 def version_tuple(v):
3113 return tuple(int(e) for e in re.split(r'[-.]', v))
3114
3115
3116 def is_outdated_version(version, limit, assume_new=True):
3117 if not version:
3118 return not assume_new
3119 try:
3120 return version_tuple(version) < version_tuple(limit)
3121 except ValueError:
3122 return not assume_new
3123
3124
3125 def ytdl_is_updateable():
3126 """ Returns if yt-dlp can be updated with -U """
3127
3128 from .update import is_non_updateable
3129
3130 return not is_non_updateable()
3131
3132
3133 def args_to_str(args):
3134 # Get a short string representation for a subprocess command
3135 return ' '.join(compat_shlex_quote(a) for a in args)
3136
3137
3138 def error_to_compat_str(err):
3139 return str(err)
3140
3141
3142 def error_to_str(err):
3143 return f'{type(err).__name__}: {err}'
3144
3145
3146 def mimetype2ext(mt):
3147 if mt is None:
3148 return None
3149
3150 mt, _, params = mt.partition(';')
3151 mt = mt.strip()
3152
3153 FULL_MAP = {
3154 'audio/mp4': 'm4a',
3155 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3156 # it's the most popular one
3157 'audio/mpeg': 'mp3',
3158 'audio/x-wav': 'wav',
3159 'audio/wav': 'wav',
3160 'audio/wave': 'wav',
3161 }
3162
3163 ext = FULL_MAP.get(mt)
3164 if ext is not None:
3165 return ext
3166
3167 SUBTYPE_MAP = {
3168 '3gpp': '3gp',
3169 'smptett+xml': 'tt',
3170 'ttaf+xml': 'dfxp',
3171 'ttml+xml': 'ttml',
3172 'x-flv': 'flv',
3173 'x-mp4-fragmented': 'mp4',
3174 'x-ms-sami': 'sami',
3175 'x-ms-wmv': 'wmv',
3176 'mpegurl': 'm3u8',
3177 'x-mpegurl': 'm3u8',
3178 'vnd.apple.mpegurl': 'm3u8',
3179 'dash+xml': 'mpd',
3180 'f4m+xml': 'f4m',
3181 'hds+xml': 'f4m',
3182 'vnd.ms-sstr+xml': 'ism',
3183 'quicktime': 'mov',
3184 'mp2t': 'ts',
3185 'x-wav': 'wav',
3186 'filmstrip+json': 'fs',
3187 'svg+xml': 'svg',
3188 }
3189
3190 _, _, subtype = mt.rpartition('/')
3191 ext = SUBTYPE_MAP.get(subtype.lower())
3192 if ext is not None:
3193 return ext
3194
3195 SUFFIX_MAP = {
3196 'json': 'json',
3197 'xml': 'xml',
3198 'zip': 'zip',
3199 'gzip': 'gz',
3200 }
3201
3202 _, _, suffix = subtype.partition('+')
3203 ext = SUFFIX_MAP.get(suffix)
3204 if ext is not None:
3205 return ext
3206
3207 return subtype.replace('+', '.')
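
# Illustrative MIME types, one per lookup stage (full map, subtype map,
# and the bare-subtype fallback):
#
#   >>> mimetype2ext('audio/mp4')
#   'm4a'
#   >>> mimetype2ext('application/x-mpegURL; charset=utf-8')
#   'm3u8'
#   >>> mimetype2ext('video/webm')
#   'webm'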
3208
3209
3210 def ext2mimetype(ext_or_url):
3211 if not ext_or_url:
3212 return None
3213 if '.' not in ext_or_url:
3214 ext_or_url = f'file.{ext_or_url}'
3215 return mimetypes.guess_type(ext_or_url)[0]
3216
3217
3218 def parse_codecs(codecs_str):
3219 # http://tools.ietf.org/html/rfc6381
3220 if not codecs_str:
3221 return {}
3222 split_codecs = list(filter(None, map(
3223 str.strip, codecs_str.strip().strip(',').split(','))))
3224 vcodec, acodec, scodec, hdr = None, None, None, None
3225 for full_codec in split_codecs:
3226 parts = full_codec.split('.')
3227 codec = parts[0].replace('0', '')
3228 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3229 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3230 if not vcodec:
3231 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3232 if codec in ('dvh1', 'dvhe'):
3233 hdr = 'DV'
3234 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3235 hdr = 'HDR10'
3236 elif full_codec.replace('0', '').startswith('vp9.2'):
3237 hdr = 'HDR10'
3238 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3239 if not acodec:
3240 acodec = full_codec
3241 elif codec in ('stpp', 'wvtt',):
3242 if not scodec:
3243 scodec = full_codec
3244 else:
3245 write_string(f'WARNING: Unknown codec {full_codec}\n')
3246 if vcodec or acodec or scodec:
3247 return {
3248 'vcodec': vcodec or 'none',
3249 'acodec': acodec or 'none',
3250 'dynamic_range': hdr,
3251 **({'scodec': scodec} if scodec is not None else {}),
3252 }
3253 elif len(split_codecs) == 2:
3254 return {
3255 'vcodec': split_codecs[0],
3256 'acodec': split_codecs[1],
3257 }
3258 return {}
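
# An illustrative codecs string (RFC 6381 style):
#
#   >>> parse_codecs('avc1.64001f, mp4a.40.2')
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}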
3259
3260
3261 def urlhandle_detect_ext(url_handle):
3262 getheader = url_handle.headers.get
3263
3264 cd = getheader('Content-Disposition')
3265 if cd:
3266 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3267 if m:
3268 e = determine_ext(m.group('filename'), default_ext=None)
3269 if e:
3270 return e
3271
3272 return mimetype2ext(getheader('Content-Type'))
3273
3274
3275 def encode_data_uri(data, mime_type):
3276 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3277
3278
3279 def age_restricted(content_limit, age_limit):
3280 """ Returns True iff the content should be blocked """
3281
3282 if age_limit is None: # No limit set
3283 return False
3284 if content_limit is None:
3285 return False # Content available for everyone
3286 return age_limit < content_limit
3287
3288
3289 def is_html(first_bytes):
3290 """ Detect whether a file contains HTML by examining its first bytes. """
3291
3292 BOMS = [
3293 (b'\xef\xbb\xbf', 'utf-8'),
3294 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3295 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3296 (b'\xff\xfe', 'utf-16-le'),
3297 (b'\xfe\xff', 'utf-16-be'),
3298 ]
3299 for bom, enc in BOMS:
3300 if first_bytes.startswith(bom):
3301 s = first_bytes[len(bom):].decode(enc, 'replace')
3302 break
3303 else:
3304 s = first_bytes.decode('utf-8', 'replace')
3305
3306 return re.match(r'^\s*<', s)
3307
3308
3309 def determine_protocol(info_dict):
3310 protocol = info_dict.get('protocol')
3311 if protocol is not None:
3312 return protocol
3313
3314 url = sanitize_url(info_dict['url'])
3315 if url.startswith('rtmp'):
3316 return 'rtmp'
3317 elif url.startswith('mms'):
3318 return 'mms'
3319 elif url.startswith('rtsp'):
3320 return 'rtsp'
3321
3322 ext = determine_ext(url)
3323 if ext == 'm3u8':
3324 return 'm3u8'
3325 elif ext == 'f4m':
3326 return 'f4m'
3327
3328 return compat_urllib_parse_urlparse(url).scheme
3329
3330
3331 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3332 """ Render a list of rows, each as a list of values.
3333 Text after a \t will be right aligned """
3334 def width(string):
3335 return len(remove_terminal_sequences(string).replace('\t', ''))
3336
3337 def get_max_lens(table):
3338 return [max(width(str(v)) for v in col) for col in zip(*table)]
3339
3340 def filter_using_list(row, filterArray):
3341 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3342
3343 max_lens = get_max_lens(data) if hide_empty else []
3344 header_row = filter_using_list(header_row, max_lens)
3345 data = [filter_using_list(row, max_lens) for row in data]
3346
3347 table = [header_row] + data
3348 max_lens = get_max_lens(table)
3349 extra_gap += 1
3350 if delim:
3351 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3352 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3353 for row in table:
3354 for pos, text in enumerate(map(str, row)):
3355 if '\t' in text:
3356 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3357 else:
3358 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3359 ret = '\n'.join(''.join(row).rstrip() for row in table)
3360 return ret
3361
3362
3363 def _match_one(filter_part, dct, incomplete):
3364 # TODO: Generalize code with YoutubeDL._build_format_filter
3365 STRING_OPERATORS = {
3366 '*=': operator.contains,
3367 '^=': lambda attr, value: attr.startswith(value),
3368 '$=': lambda attr, value: attr.endswith(value),
3369 '~=': lambda attr, value: re.search(value, attr),
3370 }
3371 COMPARISON_OPERATORS = {
3372 **STRING_OPERATORS,
3373 '<=': operator.le, # "<=" must be defined above "<"
3374 '<': operator.lt,
3375 '>=': operator.ge,
3376 '>': operator.gt,
3377 '=': operator.eq,
3378 }
3379
3380 if isinstance(incomplete, bool):
3381 is_incomplete = lambda _: incomplete
3382 else:
3383 is_incomplete = lambda k: k in incomplete
3384
3385 operator_rex = re.compile(r'''(?x)\s*
3386 (?P<key>[a-z_]+)
3387 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3388 (?:
3389 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3390 (?P<strval>.+?)
3391 )
3392 \s*$
3393 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3394 m = operator_rex.search(filter_part)
3395 if m:
3396 m = m.groupdict()
3397 unnegated_op = COMPARISON_OPERATORS[m['op']]
3398 if m['negation']:
3399 op = lambda attr, value: not unnegated_op(attr, value)
3400 else:
3401 op = unnegated_op
3402 comparison_value = m['quotedstrval'] or m['strval']
3403 if m['quote']:
3404 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3405 actual_value = dct.get(m['key'])
3406 numeric_comparison = None
3407 if isinstance(actual_value, (int, float)):
3408 # If the original field is a string and the matching comparison value is
3409 # a number, we should respect the origin of the original field
3410 # and process the comparison value as a string (see
3411 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3412 try:
3413 numeric_comparison = int(comparison_value)
3414 except ValueError:
3415 numeric_comparison = parse_filesize(comparison_value)
3416 if numeric_comparison is None:
3417 numeric_comparison = parse_filesize(f'{comparison_value}B')
3418 if numeric_comparison is None:
3419 numeric_comparison = parse_duration(comparison_value)
3420 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3421 raise ValueError('Operator %s only supports string values!' % m['op'])
3422 if actual_value is None:
3423 return is_incomplete(m['key']) or m['none_inclusive']
3424 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3425
3426 UNARY_OPERATORS = {
3427 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3428 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3429 }
3430 operator_rex = re.compile(r'''(?x)\s*
3431 (?P<op>%s)\s*(?P<key>[a-z_]+)
3432 \s*$
3433 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3434 m = operator_rex.search(filter_part)
3435 if m:
3436 op = UNARY_OPERATORS[m.group('op')]
3437 actual_value = dct.get(m.group('key'))
3438 if is_incomplete(m.group('key')) and actual_value is None:
3439 return True
3440 return op(actual_value)
3441
3442 raise ValueError('Invalid filter part %r' % filter_part)
3443
3444
3445 def match_str(filter_str, dct, incomplete=False):
3446 """ Filter a dictionary with a simple string syntax.
3447 @returns Whether the filter passes
3448 @param incomplete Set of keys that are expected to be missing from dct.
3449 Can be True/False to indicate all/none of the keys may be missing.
3450 All conditions on incomplete keys pass if the key is missing
3451 """
3452 return all(
3453 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3454 for filter_part in re.split(r'(?<!\\)&', filter_str))
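
# Illustrative filters against a hypothetical info dict:
#
#   >>> match_str('duration > 60 & like_count', {'duration': 90, 'like_count': 3})
#   True
#   >>> match_str('!is_live', {'is_live': True})
#   False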
3455
3456
3457 def match_filter_func(filters):
3458 if not filters:
3459 return None
3460 filters = set(variadic(filters))
3461
3462 interactive = '-' in filters
3463 if interactive:
3464 filters.remove('-')
3465
3466 def _match_func(info_dict, incomplete=False):
3467 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3468 return NO_DEFAULT if interactive and not incomplete else None
3469 else:
3470 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3471 filter_str = ') | ('.join(map(str.strip, filters))
3472 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3473 return _match_func
3474
3475
3476 def parse_dfxp_time_expr(time_expr):
3477 if not time_expr:
3478 return
3479
3480 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3481 if mobj:
3482 return float(mobj.group('time_offset'))
3483
3484 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3485 if mobj:
3486 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3487
3488
3489 def srt_subtitles_timecode(seconds):
3490 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3491
3492
3493 def ass_subtitles_timecode(seconds):
3494 time = timetuple_from_msec(seconds * 1000)
3495 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3496
3497
3498 def dfxp2srt(dfxp_data):
3499 '''
3500 @param dfxp_data A bytes-like object containing DFXP data
3501 @returns A unicode object containing converted SRT data
3502 '''
3503 LEGACY_NAMESPACES = (
3504 (b'http://www.w3.org/ns/ttml', [
3505 b'http://www.w3.org/2004/11/ttaf1',
3506 b'http://www.w3.org/2006/04/ttaf1',
3507 b'http://www.w3.org/2006/10/ttaf1',
3508 ]),
3509 (b'http://www.w3.org/ns/ttml#styling', [
3510 b'http://www.w3.org/ns/ttml#style',
3511 ]),
3512 )
3513
3514 SUPPORTED_STYLING = [
3515 'color',
3516 'fontFamily',
3517 'fontSize',
3518 'fontStyle',
3519 'fontWeight',
3520 'textDecoration'
3521 ]
3522
3523 _x = functools.partial(xpath_with_ns, ns_map={
3524 'xml': 'http://www.w3.org/XML/1998/namespace',
3525 'ttml': 'http://www.w3.org/ns/ttml',
3526 'tts': 'http://www.w3.org/ns/ttml#styling',
3527 })
3528
3529 styles = {}
3530 default_style = {}
3531
3532 class TTMLPElementParser:
3533 _out = ''
3534 _unclosed_elements = []
3535 _applied_styles = []
3536
3537 def start(self, tag, attrib):
3538 if tag in (_x('ttml:br'), 'br'):
3539 self._out += '\n'
3540 else:
3541 unclosed_elements = []
3542 style = {}
3543 element_style_id = attrib.get('style')
3544 if default_style:
3545 style.update(default_style)
3546 if element_style_id:
3547 style.update(styles.get(element_style_id, {}))
3548 for prop in SUPPORTED_STYLING:
3549 prop_val = attrib.get(_x('tts:' + prop))
3550 if prop_val:
3551 style[prop] = prop_val
3552 if style:
3553 font = ''
3554 for k, v in sorted(style.items()):
3555 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3556 continue
3557 if k == 'color':
3558 font += ' color="%s"' % v
3559 elif k == 'fontSize':
3560 font += ' size="%s"' % v
3561 elif k == 'fontFamily':
3562 font += ' face="%s"' % v
3563 elif k == 'fontWeight' and v == 'bold':
3564 self._out += '<b>'
3565 unclosed_elements.append('b')
3566 elif k == 'fontStyle' and v == 'italic':
3567 self._out += '<i>'
3568 unclosed_elements.append('i')
3569 elif k == 'textDecoration' and v == 'underline':
3570 self._out += '<u>'
3571 unclosed_elements.append('u')
3572 if font:
3573 self._out += '<font' + font + '>'
3574 unclosed_elements.append('font')
3575 applied_style = {}
3576 if self._applied_styles:
3577 applied_style.update(self._applied_styles[-1])
3578 applied_style.update(style)
3579 self._applied_styles.append(applied_style)
3580 self._unclosed_elements.append(unclosed_elements)
3581
3582 def end(self, tag):
3583 if tag not in (_x('ttml:br'), 'br'):
3584 unclosed_elements = self._unclosed_elements.pop()
3585 for element in reversed(unclosed_elements):
3586 self._out += '</%s>' % element
3587 if unclosed_elements and self._applied_styles:
3588 self._applied_styles.pop()
3589
3590 def data(self, data):
3591 self._out += data
3592
3593 def close(self):
3594 return self._out.strip()
3595
3596 def parse_node(node):
3597 target = TTMLPElementParser()
3598 parser = xml.etree.ElementTree.XMLParser(target=target)
3599 parser.feed(xml.etree.ElementTree.tostring(node))
3600 return parser.close()
3601
3602 for k, v in LEGACY_NAMESPACES:
3603 for ns in v:
3604 dfxp_data = dfxp_data.replace(ns, k)
3605
3606 dfxp = compat_etree_fromstring(dfxp_data)
3607 out = []
3608 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3609
3610 if not paras:
3611 raise ValueError('Invalid dfxp/TTML subtitle')
3612
3613 repeat = False
3614 while True:
3615 for style in dfxp.findall(_x('.//ttml:style')):
3616 style_id = style.get('id') or style.get(_x('xml:id'))
3617 if not style_id:
3618 continue
3619 parent_style_id = style.get('style')
3620 if parent_style_id:
3621 if parent_style_id not in styles:
3622 repeat = True
3623 continue
3624 styles[style_id] = styles[parent_style_id].copy()
3625 for prop in SUPPORTED_STYLING:
3626 prop_val = style.get(_x('tts:' + prop))
3627 if prop_val:
3628 styles.setdefault(style_id, {})[prop] = prop_val
3629 if repeat:
3630 repeat = False
3631 else:
3632 break
3633
3634 for p in ('body', 'div'):
3635 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3636 if ele is None:
3637 continue
3638 style = styles.get(ele.get('style'))
3639 if not style:
3640 continue
3641 default_style.update(style)
3642
3643 for para, index in zip(paras, itertools.count(1)):
3644 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3645 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3646 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3647 if begin_time is None:
3648 continue
3649 if not end_time:
3650 if not dur:
3651 continue
3652 end_time = begin_time + dur
3653 out.append('%d\n%s --> %s\n%s\n\n' % (
3654 index,
3655 srt_subtitles_timecode(begin_time),
3656 srt_subtitles_timecode(end_time),
3657 parse_node(para)))
3658
3659 return ''.join(out)
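# Usage sketch with a minimal (hypothetical) TTML document:
#   dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body><p begin="0s" end="1s">Hi</p></body></tt>')
# is expected to yield '1\n00:00:00,000 --> 00:00:01,000\nHi\n\n'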
3660
3661
3662 def cli_option(params, command_option, param):
3663 param = params.get(param)
3664 if param is not None:  # convert any non-None value, including falsy ones like 0
3665 param = compat_str(param)
3666 return [command_option, param] if param is not None else []
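# e.g. cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy')
#   -> ['--proxy', 'http://127.0.0.1:3128'], and [] when the key is absent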
3667
3668
3669 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3670 param = params.get(param)
3671 if param is None:
3672 return []
3673 assert isinstance(param, bool)
3674 if separator:
3675 return [command_option + separator + (true_value if param else false_value)]
3676 return [command_option, true_value if param else false_value]
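# e.g. cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   -> ['--no-check-certificate', 'true'], or ['--no-check-certificate=true'] with separator='='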
3677
3678
3679 def cli_valueless_option(params, command_option, param, expected_value=True):
3680 param = params.get(param)
3681 return [command_option] if param == expected_value else []
3682
3683
3684 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3685 if isinstance(argdict, (list, tuple)): # for backward compatibility
3686 if use_compat:
3687 return argdict
3688 else:
3689 argdict = None
3690 if argdict is None:
3691 return default
3692 assert isinstance(argdict, dict)
3693
3694 assert isinstance(keys, (list, tuple))
3695 for key_list in keys:
3696 arg_list = list(filter(
3697 lambda x: x is not None,
3698 [argdict.get(key.lower()) for key in variadic(key_list)]))
3699 if arg_list:
3700 return [arg for args in arg_list for arg in args]
3701 return default
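# e.g. cli_configuration_args({'ffmpeg': ['-v'], 'default': ['-y']}, [('ffmpeg',), 'default'])
#   -> ['-v']  (the first key (list) that yields any args wins)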
3702
3703
3704 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3705 main_key, exe = main_key.lower(), exe.lower()
3706 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3707 keys = [f'{root_key}{k}' for k in (keys or [''])]
3708 if root_key in keys:
3709 if main_key != exe:
3710 keys.append((main_key, exe))
3711 keys.append('default')
3712 else:
3713 use_compat = False
3714 return cli_configuration_args(argdict, keys, default, use_compat)
3715
3716
3717 class ISO639Utils:
3718 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3719 _lang_map = {
3720 'aa': 'aar',
3721 'ab': 'abk',
3722 'ae': 'ave',
3723 'af': 'afr',
3724 'ak': 'aka',
3725 'am': 'amh',
3726 'an': 'arg',
3727 'ar': 'ara',
3728 'as': 'asm',
3729 'av': 'ava',
3730 'ay': 'aym',
3731 'az': 'aze',
3732 'ba': 'bak',
3733 'be': 'bel',
3734 'bg': 'bul',
3735 'bh': 'bih',
3736 'bi': 'bis',
3737 'bm': 'bam',
3738 'bn': 'ben',
3739 'bo': 'bod',
3740 'br': 'bre',
3741 'bs': 'bos',
3742 'ca': 'cat',
3743 'ce': 'che',
3744 'ch': 'cha',
3745 'co': 'cos',
3746 'cr': 'cre',
3747 'cs': 'ces',
3748 'cu': 'chu',
3749 'cv': 'chv',
3750 'cy': 'cym',
3751 'da': 'dan',
3752 'de': 'deu',
3753 'dv': 'div',
3754 'dz': 'dzo',
3755 'ee': 'ewe',
3756 'el': 'ell',
3757 'en': 'eng',
3758 'eo': 'epo',
3759 'es': 'spa',
3760 'et': 'est',
3761 'eu': 'eus',
3762 'fa': 'fas',
3763 'ff': 'ful',
3764 'fi': 'fin',
3765 'fj': 'fij',
3766 'fo': 'fao',
3767 'fr': 'fra',
3768 'fy': 'fry',
3769 'ga': 'gle',
3770 'gd': 'gla',
3771 'gl': 'glg',
3772 'gn': 'grn',
3773 'gu': 'guj',
3774 'gv': 'glv',
3775 'ha': 'hau',
3776 'he': 'heb',
3777 'iw': 'heb', # Replaced by he in 1989 revision
3778 'hi': 'hin',
3779 'ho': 'hmo',
3780 'hr': 'hrv',
3781 'ht': 'hat',
3782 'hu': 'hun',
3783 'hy': 'hye',
3784 'hz': 'her',
3785 'ia': 'ina',
3786 'id': 'ind',
3787 'in': 'ind', # Replaced by id in 1989 revision
3788 'ie': 'ile',
3789 'ig': 'ibo',
3790 'ii': 'iii',
3791 'ik': 'ipk',
3792 'io': 'ido',
3793 'is': 'isl',
3794 'it': 'ita',
3795 'iu': 'iku',
3796 'ja': 'jpn',
3797 'jv': 'jav',
3798 'ka': 'kat',
3799 'kg': 'kon',
3800 'ki': 'kik',
3801 'kj': 'kua',
3802 'kk': 'kaz',
3803 'kl': 'kal',
3804 'km': 'khm',
3805 'kn': 'kan',
3806 'ko': 'kor',
3807 'kr': 'kau',
3808 'ks': 'kas',
3809 'ku': 'kur',
3810 'kv': 'kom',
3811 'kw': 'cor',
3812 'ky': 'kir',
3813 'la': 'lat',
3814 'lb': 'ltz',
3815 'lg': 'lug',
3816 'li': 'lim',
3817 'ln': 'lin',
3818 'lo': 'lao',
3819 'lt': 'lit',
3820 'lu': 'lub',
3821 'lv': 'lav',
3822 'mg': 'mlg',
3823 'mh': 'mah',
3824 'mi': 'mri',
3825 'mk': 'mkd',
3826 'ml': 'mal',
3827 'mn': 'mon',
3828 'mr': 'mar',
3829 'ms': 'msa',
3830 'mt': 'mlt',
3831 'my': 'mya',
3832 'na': 'nau',
3833 'nb': 'nob',
3834 'nd': 'nde',
3835 'ne': 'nep',
3836 'ng': 'ndo',
3837 'nl': 'nld',
3838 'nn': 'nno',
3839 'no': 'nor',
3840 'nr': 'nbl',
3841 'nv': 'nav',
3842 'ny': 'nya',
3843 'oc': 'oci',
3844 'oj': 'oji',
3845 'om': 'orm',
3846 'or': 'ori',
3847 'os': 'oss',
3848 'pa': 'pan',
3849 'pi': 'pli',
3850 'pl': 'pol',
3851 'ps': 'pus',
3852 'pt': 'por',
3853 'qu': 'que',
3854 'rm': 'roh',
3855 'rn': 'run',
3856 'ro': 'ron',
3857 'ru': 'rus',
3858 'rw': 'kin',
3859 'sa': 'san',
3860 'sc': 'srd',
3861 'sd': 'snd',
3862 'se': 'sme',
3863 'sg': 'sag',
3864 'si': 'sin',
3865 'sk': 'slk',
3866 'sl': 'slv',
3867 'sm': 'smo',
3868 'sn': 'sna',
3869 'so': 'som',
3870 'sq': 'sqi',
3871 'sr': 'srp',
3872 'ss': 'ssw',
3873 'st': 'sot',
3874 'su': 'sun',
3875 'sv': 'swe',
3876 'sw': 'swa',
3877 'ta': 'tam',
3878 'te': 'tel',
3879 'tg': 'tgk',
3880 'th': 'tha',
3881 'ti': 'tir',
3882 'tk': 'tuk',
3883 'tl': 'tgl',
3884 'tn': 'tsn',
3885 'to': 'ton',
3886 'tr': 'tur',
3887 'ts': 'tso',
3888 'tt': 'tat',
3889 'tw': 'twi',
3890 'ty': 'tah',
3891 'ug': 'uig',
3892 'uk': 'ukr',
3893 'ur': 'urd',
3894 'uz': 'uzb',
3895 've': 'ven',
3896 'vi': 'vie',
3897 'vo': 'vol',
3898 'wa': 'wln',
3899 'wo': 'wol',
3900 'xh': 'xho',
3901 'yi': 'yid',
3902 'ji': 'yid', # Replaced by yi in 1989 revision
3903 'yo': 'yor',
3904 'za': 'zha',
3905 'zh': 'zho',
3906 'zu': 'zul',
3907 }
3908
3909 @classmethod
3910 def short2long(cls, code):
3911 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3912 return cls._lang_map.get(code[:2])
3913
3914 @classmethod
3915 def long2short(cls, code):
3916 """Convert language code from ISO 639-2/T to ISO 639-1"""
3917 for short_name, long_name in cls._lang_map.items():
3918 if long_name == code:
3919 return short_name
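# e.g. ISO639Utils.short2long('en') == 'eng' and ISO639Utils.long2short('deu') == 'de'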
3920
3921
3922 class ISO3166Utils:
3923 # From http://data.okfn.org/data/core/country-list
3924 _country_map = {
3925 'AF': 'Afghanistan',
3926 'AX': 'Åland Islands',
3927 'AL': 'Albania',
3928 'DZ': 'Algeria',
3929 'AS': 'American Samoa',
3930 'AD': 'Andorra',
3931 'AO': 'Angola',
3932 'AI': 'Anguilla',
3933 'AQ': 'Antarctica',
3934 'AG': 'Antigua and Barbuda',
3935 'AR': 'Argentina',
3936 'AM': 'Armenia',
3937 'AW': 'Aruba',
3938 'AU': 'Australia',
3939 'AT': 'Austria',
3940 'AZ': 'Azerbaijan',
3941 'BS': 'Bahamas',
3942 'BH': 'Bahrain',
3943 'BD': 'Bangladesh',
3944 'BB': 'Barbados',
3945 'BY': 'Belarus',
3946 'BE': 'Belgium',
3947 'BZ': 'Belize',
3948 'BJ': 'Benin',
3949 'BM': 'Bermuda',
3950 'BT': 'Bhutan',
3951 'BO': 'Bolivia, Plurinational State of',
3952 'BQ': 'Bonaire, Sint Eustatius and Saba',
3953 'BA': 'Bosnia and Herzegovina',
3954 'BW': 'Botswana',
3955 'BV': 'Bouvet Island',
3956 'BR': 'Brazil',
3957 'IO': 'British Indian Ocean Territory',
3958 'BN': 'Brunei Darussalam',
3959 'BG': 'Bulgaria',
3960 'BF': 'Burkina Faso',
3961 'BI': 'Burundi',
3962 'KH': 'Cambodia',
3963 'CM': 'Cameroon',
3964 'CA': 'Canada',
3965 'CV': 'Cape Verde',
3966 'KY': 'Cayman Islands',
3967 'CF': 'Central African Republic',
3968 'TD': 'Chad',
3969 'CL': 'Chile',
3970 'CN': 'China',
3971 'CX': 'Christmas Island',
3972 'CC': 'Cocos (Keeling) Islands',
3973 'CO': 'Colombia',
3974 'KM': 'Comoros',
3975 'CG': 'Congo',
3976 'CD': 'Congo, the Democratic Republic of the',
3977 'CK': 'Cook Islands',
3978 'CR': 'Costa Rica',
3979 'CI': 'Côte d\'Ivoire',
3980 'HR': 'Croatia',
3981 'CU': 'Cuba',
3982 'CW': 'Curaçao',
3983 'CY': 'Cyprus',
3984 'CZ': 'Czech Republic',
3985 'DK': 'Denmark',
3986 'DJ': 'Djibouti',
3987 'DM': 'Dominica',
3988 'DO': 'Dominican Republic',
3989 'EC': 'Ecuador',
3990 'EG': 'Egypt',
3991 'SV': 'El Salvador',
3992 'GQ': 'Equatorial Guinea',
3993 'ER': 'Eritrea',
3994 'EE': 'Estonia',
3995 'ET': 'Ethiopia',
3996 'FK': 'Falkland Islands (Malvinas)',
3997 'FO': 'Faroe Islands',
3998 'FJ': 'Fiji',
3999 'FI': 'Finland',
4000 'FR': 'France',
4001 'GF': 'French Guiana',
4002 'PF': 'French Polynesia',
4003 'TF': 'French Southern Territories',
4004 'GA': 'Gabon',
4005 'GM': 'Gambia',
4006 'GE': 'Georgia',
4007 'DE': 'Germany',
4008 'GH': 'Ghana',
4009 'GI': 'Gibraltar',
4010 'GR': 'Greece',
4011 'GL': 'Greenland',
4012 'GD': 'Grenada',
4013 'GP': 'Guadeloupe',
4014 'GU': 'Guam',
4015 'GT': 'Guatemala',
4016 'GG': 'Guernsey',
4017 'GN': 'Guinea',
4018 'GW': 'Guinea-Bissau',
4019 'GY': 'Guyana',
4020 'HT': 'Haiti',
4021 'HM': 'Heard Island and McDonald Islands',
4022 'VA': 'Holy See (Vatican City State)',
4023 'HN': 'Honduras',
4024 'HK': 'Hong Kong',
4025 'HU': 'Hungary',
4026 'IS': 'Iceland',
4027 'IN': 'India',
4028 'ID': 'Indonesia',
4029 'IR': 'Iran, Islamic Republic of',
4030 'IQ': 'Iraq',
4031 'IE': 'Ireland',
4032 'IM': 'Isle of Man',
4033 'IL': 'Israel',
4034 'IT': 'Italy',
4035 'JM': 'Jamaica',
4036 'JP': 'Japan',
4037 'JE': 'Jersey',
4038 'JO': 'Jordan',
4039 'KZ': 'Kazakhstan',
4040 'KE': 'Kenya',
4041 'KI': 'Kiribati',
4042 'KP': 'Korea, Democratic People\'s Republic of',
4043 'KR': 'Korea, Republic of',
4044 'KW': 'Kuwait',
4045 'KG': 'Kyrgyzstan',
4046 'LA': 'Lao People\'s Democratic Republic',
4047 'LV': 'Latvia',
4048 'LB': 'Lebanon',
4049 'LS': 'Lesotho',
4050 'LR': 'Liberia',
4051 'LY': 'Libya',
4052 'LI': 'Liechtenstein',
4053 'LT': 'Lithuania',
4054 'LU': 'Luxembourg',
4055 'MO': 'Macao',
4056 'MK': 'Macedonia, the Former Yugoslav Republic of',
4057 'MG': 'Madagascar',
4058 'MW': 'Malawi',
4059 'MY': 'Malaysia',
4060 'MV': 'Maldives',
4061 'ML': 'Mali',
4062 'MT': 'Malta',
4063 'MH': 'Marshall Islands',
4064 'MQ': 'Martinique',
4065 'MR': 'Mauritania',
4066 'MU': 'Mauritius',
4067 'YT': 'Mayotte',
4068 'MX': 'Mexico',
4069 'FM': 'Micronesia, Federated States of',
4070 'MD': 'Moldova, Republic of',
4071 'MC': 'Monaco',
4072 'MN': 'Mongolia',
4073 'ME': 'Montenegro',
4074 'MS': 'Montserrat',
4075 'MA': 'Morocco',
4076 'MZ': 'Mozambique',
4077 'MM': 'Myanmar',
4078 'NA': 'Namibia',
4079 'NR': 'Nauru',
4080 'NP': 'Nepal',
4081 'NL': 'Netherlands',
4082 'NC': 'New Caledonia',
4083 'NZ': 'New Zealand',
4084 'NI': 'Nicaragua',
4085 'NE': 'Niger',
4086 'NG': 'Nigeria',
4087 'NU': 'Niue',
4088 'NF': 'Norfolk Island',
4089 'MP': 'Northern Mariana Islands',
4090 'NO': 'Norway',
4091 'OM': 'Oman',
4092 'PK': 'Pakistan',
4093 'PW': 'Palau',
4094 'PS': 'Palestine, State of',
4095 'PA': 'Panama',
4096 'PG': 'Papua New Guinea',
4097 'PY': 'Paraguay',
4098 'PE': 'Peru',
4099 'PH': 'Philippines',
4100 'PN': 'Pitcairn',
4101 'PL': 'Poland',
4102 'PT': 'Portugal',
4103 'PR': 'Puerto Rico',
4104 'QA': 'Qatar',
4105 'RE': 'Réunion',
4106 'RO': 'Romania',
4107 'RU': 'Russian Federation',
4108 'RW': 'Rwanda',
4109 'BL': 'Saint Barthélemy',
4110 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4111 'KN': 'Saint Kitts and Nevis',
4112 'LC': 'Saint Lucia',
4113 'MF': 'Saint Martin (French part)',
4114 'PM': 'Saint Pierre and Miquelon',
4115 'VC': 'Saint Vincent and the Grenadines',
4116 'WS': 'Samoa',
4117 'SM': 'San Marino',
4118 'ST': 'Sao Tome and Principe',
4119 'SA': 'Saudi Arabia',
4120 'SN': 'Senegal',
4121 'RS': 'Serbia',
4122 'SC': 'Seychelles',
4123 'SL': 'Sierra Leone',
4124 'SG': 'Singapore',
4125 'SX': 'Sint Maarten (Dutch part)',
4126 'SK': 'Slovakia',
4127 'SI': 'Slovenia',
4128 'SB': 'Solomon Islands',
4129 'SO': 'Somalia',
4130 'ZA': 'South Africa',
4131 'GS': 'South Georgia and the South Sandwich Islands',
4132 'SS': 'South Sudan',
4133 'ES': 'Spain',
4134 'LK': 'Sri Lanka',
4135 'SD': 'Sudan',
4136 'SR': 'Suriname',
4137 'SJ': 'Svalbard and Jan Mayen',
4138 'SZ': 'Swaziland',
4139 'SE': 'Sweden',
4140 'CH': 'Switzerland',
4141 'SY': 'Syrian Arab Republic',
4142 'TW': 'Taiwan, Province of China',
4143 'TJ': 'Tajikistan',
4144 'TZ': 'Tanzania, United Republic of',
4145 'TH': 'Thailand',
4146 'TL': 'Timor-Leste',
4147 'TG': 'Togo',
4148 'TK': 'Tokelau',
4149 'TO': 'Tonga',
4150 'TT': 'Trinidad and Tobago',
4151 'TN': 'Tunisia',
4152 'TR': 'Turkey',
4153 'TM': 'Turkmenistan',
4154 'TC': 'Turks and Caicos Islands',
4155 'TV': 'Tuvalu',
4156 'UG': 'Uganda',
4157 'UA': 'Ukraine',
4158 'AE': 'United Arab Emirates',
4159 'GB': 'United Kingdom',
4160 'US': 'United States',
4161 'UM': 'United States Minor Outlying Islands',
4162 'UY': 'Uruguay',
4163 'UZ': 'Uzbekistan',
4164 'VU': 'Vanuatu',
4165 'VE': 'Venezuela, Bolivarian Republic of',
4166 'VN': 'Viet Nam',
4167 'VG': 'Virgin Islands, British',
4168 'VI': 'Virgin Islands, U.S.',
4169 'WF': 'Wallis and Futuna',
4170 'EH': 'Western Sahara',
4171 'YE': 'Yemen',
4172 'ZM': 'Zambia',
4173 'ZW': 'Zimbabwe',
4174 }
4175
4176 @classmethod
4177 def short2full(cls, code):
4178 """Convert an ISO 3166-2 country code to the corresponding full name"""
4179 return cls._country_map.get(code.upper())
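# e.g. ISO3166Utils.short2full('de') == 'Germany'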
4180
4181
4182 class GeoUtils:
4183 # Major IPv4 address blocks per country
4184 _country_ip_map = {
4185 'AD': '46.172.224.0/19',
4186 'AE': '94.200.0.0/13',
4187 'AF': '149.54.0.0/17',
4188 'AG': '209.59.64.0/18',
4189 'AI': '204.14.248.0/21',
4190 'AL': '46.99.0.0/16',
4191 'AM': '46.70.0.0/15',
4192 'AO': '105.168.0.0/13',
4193 'AP': '182.50.184.0/21',
4194 'AQ': '23.154.160.0/24',
4195 'AR': '181.0.0.0/12',
4196 'AS': '202.70.112.0/20',
4197 'AT': '77.116.0.0/14',
4198 'AU': '1.128.0.0/11',
4199 'AW': '181.41.0.0/18',
4200 'AX': '185.217.4.0/22',
4201 'AZ': '5.197.0.0/16',
4202 'BA': '31.176.128.0/17',
4203 'BB': '65.48.128.0/17',
4204 'BD': '114.130.0.0/16',
4205 'BE': '57.0.0.0/8',
4206 'BF': '102.178.0.0/15',
4207 'BG': '95.42.0.0/15',
4208 'BH': '37.131.0.0/17',
4209 'BI': '154.117.192.0/18',
4210 'BJ': '137.255.0.0/16',
4211 'BL': '185.212.72.0/23',
4212 'BM': '196.12.64.0/18',
4213 'BN': '156.31.0.0/16',
4214 'BO': '161.56.0.0/16',
4215 'BQ': '161.0.80.0/20',
4216 'BR': '191.128.0.0/12',
4217 'BS': '24.51.64.0/18',
4218 'BT': '119.2.96.0/19',
4219 'BW': '168.167.0.0/16',
4220 'BY': '178.120.0.0/13',
4221 'BZ': '179.42.192.0/18',
4222 'CA': '99.224.0.0/11',
4223 'CD': '41.243.0.0/16',
4224 'CF': '197.242.176.0/21',
4225 'CG': '160.113.0.0/16',
4226 'CH': '85.0.0.0/13',
4227 'CI': '102.136.0.0/14',
4228 'CK': '202.65.32.0/19',
4229 'CL': '152.172.0.0/14',
4230 'CM': '102.244.0.0/14',
4231 'CN': '36.128.0.0/10',
4232 'CO': '181.240.0.0/12',
4233 'CR': '201.192.0.0/12',
4234 'CU': '152.206.0.0/15',
4235 'CV': '165.90.96.0/19',
4236 'CW': '190.88.128.0/17',
4237 'CY': '31.153.0.0/16',
4238 'CZ': '88.100.0.0/14',
4239 'DE': '53.0.0.0/8',
4240 'DJ': '197.241.0.0/17',
4241 'DK': '87.48.0.0/12',
4242 'DM': '192.243.48.0/20',
4243 'DO': '152.166.0.0/15',
4244 'DZ': '41.96.0.0/12',
4245 'EC': '186.68.0.0/15',
4246 'EE': '90.190.0.0/15',
4247 'EG': '156.160.0.0/11',
4248 'ER': '196.200.96.0/20',
4249 'ES': '88.0.0.0/11',
4250 'ET': '196.188.0.0/14',
4251 'EU': '2.16.0.0/13',
4252 'FI': '91.152.0.0/13',
4253 'FJ': '144.120.0.0/16',
4254 'FK': '80.73.208.0/21',
4255 'FM': '119.252.112.0/20',
4256 'FO': '88.85.32.0/19',
4257 'FR': '90.0.0.0/9',
4258 'GA': '41.158.0.0/15',
4259 'GB': '25.0.0.0/8',
4260 'GD': '74.122.88.0/21',
4261 'GE': '31.146.0.0/16',
4262 'GF': '161.22.64.0/18',
4263 'GG': '62.68.160.0/19',
4264 'GH': '154.160.0.0/12',
4265 'GI': '95.164.0.0/16',
4266 'GL': '88.83.0.0/19',
4267 'GM': '160.182.0.0/15',
4268 'GN': '197.149.192.0/18',
4269 'GP': '104.250.0.0/19',
4270 'GQ': '105.235.224.0/20',
4271 'GR': '94.64.0.0/13',
4272 'GT': '168.234.0.0/16',
4273 'GU': '168.123.0.0/16',
4274 'GW': '197.214.80.0/20',
4275 'GY': '181.41.64.0/18',
4276 'HK': '113.252.0.0/14',
4277 'HN': '181.210.0.0/16',
4278 'HR': '93.136.0.0/13',
4279 'HT': '148.102.128.0/17',
4280 'HU': '84.0.0.0/14',
4281 'ID': '39.192.0.0/10',
4282 'IE': '87.32.0.0/12',
4283 'IL': '79.176.0.0/13',
4284 'IM': '5.62.80.0/20',
4285 'IN': '117.192.0.0/10',
4286 'IO': '203.83.48.0/21',
4287 'IQ': '37.236.0.0/14',
4288 'IR': '2.176.0.0/12',
4289 'IS': '82.221.0.0/16',
4290 'IT': '79.0.0.0/10',
4291 'JE': '87.244.64.0/18',
4292 'JM': '72.27.0.0/17',
4293 'JO': '176.29.0.0/16',
4294 'JP': '133.0.0.0/8',
4295 'KE': '105.48.0.0/12',
4296 'KG': '158.181.128.0/17',
4297 'KH': '36.37.128.0/17',
4298 'KI': '103.25.140.0/22',
4299 'KM': '197.255.224.0/20',
4300 'KN': '198.167.192.0/19',
4301 'KP': '175.45.176.0/22',
4302 'KR': '175.192.0.0/10',
4303 'KW': '37.36.0.0/14',
4304 'KY': '64.96.0.0/15',
4305 'KZ': '2.72.0.0/13',
4306 'LA': '115.84.64.0/18',
4307 'LB': '178.135.0.0/16',
4308 'LC': '24.92.144.0/20',
4309 'LI': '82.117.0.0/19',
4310 'LK': '112.134.0.0/15',
4311 'LR': '102.183.0.0/16',
4312 'LS': '129.232.0.0/17',
4313 'LT': '78.56.0.0/13',
4314 'LU': '188.42.0.0/16',
4315 'LV': '46.109.0.0/16',
4316 'LY': '41.252.0.0/14',
4317 'MA': '105.128.0.0/11',
4318 'MC': '88.209.64.0/18',
4319 'MD': '37.246.0.0/16',
4320 'ME': '178.175.0.0/17',
4321 'MF': '74.112.232.0/21',
4322 'MG': '154.126.0.0/17',
4323 'MH': '117.103.88.0/21',
4324 'MK': '77.28.0.0/15',
4325 'ML': '154.118.128.0/18',
4326 'MM': '37.111.0.0/17',
4327 'MN': '49.0.128.0/17',
4328 'MO': '60.246.0.0/16',
4329 'MP': '202.88.64.0/20',
4330 'MQ': '109.203.224.0/19',
4331 'MR': '41.188.64.0/18',
4332 'MS': '208.90.112.0/22',
4333 'MT': '46.11.0.0/16',
4334 'MU': '105.16.0.0/12',
4335 'MV': '27.114.128.0/18',
4336 'MW': '102.70.0.0/15',
4337 'MX': '187.192.0.0/11',
4338 'MY': '175.136.0.0/13',
4339 'MZ': '197.218.0.0/15',
4340 'NA': '41.182.0.0/16',
4341 'NC': '101.101.0.0/18',
4342 'NE': '197.214.0.0/18',
4343 'NF': '203.17.240.0/22',
4344 'NG': '105.112.0.0/12',
4345 'NI': '186.76.0.0/15',
4346 'NL': '145.96.0.0/11',
4347 'NO': '84.208.0.0/13',
4348 'NP': '36.252.0.0/15',
4349 'NR': '203.98.224.0/19',
4350 'NU': '49.156.48.0/22',
4351 'NZ': '49.224.0.0/14',
4352 'OM': '5.36.0.0/15',
4353 'PA': '186.72.0.0/15',
4354 'PE': '186.160.0.0/14',
4355 'PF': '123.50.64.0/18',
4356 'PG': '124.240.192.0/19',
4357 'PH': '49.144.0.0/13',
4358 'PK': '39.32.0.0/11',
4359 'PL': '83.0.0.0/11',
4360 'PM': '70.36.0.0/20',
4361 'PR': '66.50.0.0/16',
4362 'PS': '188.161.0.0/16',
4363 'PT': '85.240.0.0/13',
4364 'PW': '202.124.224.0/20',
4365 'PY': '181.120.0.0/14',
4366 'QA': '37.210.0.0/15',
4367 'RE': '102.35.0.0/16',
4368 'RO': '79.112.0.0/13',
4369 'RS': '93.86.0.0/15',
4370 'RU': '5.136.0.0/13',
4371 'RW': '41.186.0.0/16',
4372 'SA': '188.48.0.0/13',
4373 'SB': '202.1.160.0/19',
4374 'SC': '154.192.0.0/11',
4375 'SD': '102.120.0.0/13',
4376 'SE': '78.64.0.0/12',
4377 'SG': '8.128.0.0/10',
4378 'SI': '188.196.0.0/14',
4379 'SK': '78.98.0.0/15',
4380 'SL': '102.143.0.0/17',
4381 'SM': '89.186.32.0/19',
4382 'SN': '41.82.0.0/15',
4383 'SO': '154.115.192.0/18',
4384 'SR': '186.179.128.0/17',
4385 'SS': '105.235.208.0/21',
4386 'ST': '197.159.160.0/19',
4387 'SV': '168.243.0.0/16',
4388 'SX': '190.102.0.0/20',
4389 'SY': '5.0.0.0/16',
4390 'SZ': '41.84.224.0/19',
4391 'TC': '65.255.48.0/20',
4392 'TD': '154.68.128.0/19',
4393 'TG': '196.168.0.0/14',
4394 'TH': '171.96.0.0/13',
4395 'TJ': '85.9.128.0/18',
4396 'TK': '27.96.24.0/21',
4397 'TL': '180.189.160.0/20',
4398 'TM': '95.85.96.0/19',
4399 'TN': '197.0.0.0/11',
4400 'TO': '175.176.144.0/21',
4401 'TR': '78.160.0.0/11',
4402 'TT': '186.44.0.0/15',
4403 'TV': '202.2.96.0/19',
4404 'TW': '120.96.0.0/11',
4405 'TZ': '156.156.0.0/14',
4406 'UA': '37.52.0.0/14',
4407 'UG': '102.80.0.0/13',
4408 'US': '6.0.0.0/8',
4409 'UY': '167.56.0.0/13',
4410 'UZ': '84.54.64.0/18',
4411 'VA': '212.77.0.0/19',
4412 'VC': '207.191.240.0/21',
4413 'VE': '186.88.0.0/13',
4414 'VG': '66.81.192.0/20',
4415 'VI': '146.226.0.0/16',
4416 'VN': '14.160.0.0/11',
4417 'VU': '202.80.32.0/20',
4418 'WF': '117.20.32.0/21',
4419 'WS': '202.4.32.0/19',
4420 'YE': '134.35.0.0/16',
4421 'YT': '41.242.116.0/22',
4422 'ZA': '41.0.0.0/11',
4423 'ZM': '102.144.0.0/13',
4424 'ZW': '102.177.192.0/18',
4425 }
4426
4427 @classmethod
4428 def random_ipv4(cls, code_or_block):
4429 if len(code_or_block) == 2:
4430 block = cls._country_ip_map.get(code_or_block.upper())
4431 if not block:
4432 return None
4433 else:
4434 block = code_or_block
4435 addr, preflen = block.split('/')
4436 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4437 addr_max = addr_min | (0xffffffff >> int(preflen))
4438 return compat_str(socket.inet_ntoa(
4439 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
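# e.g. GeoUtils.random_ipv4('US') returns a random address inside 6.0.0.0/8,
# while GeoUtils.random_ipv4('203.0.113.0/24') samples an explicitly given block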
4440
4441
4442 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4443 def __init__(self, proxies=None):
4444 # Set default handlers
4445 for type in ('http', 'https'):
4446 setattr(self, '%s_open' % type,
4447 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4448 meth(r, proxy, type))
4449 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4450
4451 def proxy_open(self, req, proxy, type):
4452 req_proxy = req.headers.get('Ytdl-request-proxy')
4453 if req_proxy is not None:
4454 proxy = req_proxy
4455 del req.headers['Ytdl-request-proxy']
4456
4457 if proxy == '__noproxy__':
4458 return None # No Proxy
4459 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4460 req.add_header('Ytdl-socks-proxy', proxy)
4461 # yt-dlp's http/https handlers wrap the socket with socks themselves
4462 return None
4463 return compat_urllib_request.ProxyHandler.proxy_open(
4464 self, req, proxy, type)
4465
4466
4467 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4468 # released into Public Domain
4469 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4470
4471 def long_to_bytes(n, blocksize=0):
4472 """long_to_bytes(n:long, blocksize:int) : string
4473 Convert a long integer to a byte string.
4474
4475 If optional blocksize is given and greater than zero, pad the front of the
4476 byte string with binary zeros so that the length is a multiple of
4477 blocksize.
4478 """
4479 # after much testing, this algorithm was deemed to be the fastest
4480 s = b''
4481 n = int(n)
4482 while n > 0:
4483 s = compat_struct_pack('>I', n & 0xffffffff) + s
4484 n = n >> 32
4485 # strip off leading zeros
4486 for i in range(len(s)):
4487 if s[i] != b'\000'[0]:
4488 break
4489 else:
4490 # only happens when n == 0
4491 s = b'\000'
4492 i = 0
4493 s = s[i:]
4494 # add back some pad bytes. this could be done more efficiently w.r.t. the
4495 # de-padding being done above, but sigh...
4496 if blocksize > 0 and len(s) % blocksize:
4497 s = (blocksize - len(s) % blocksize) * b'\000' + s
4498 return s
4499
4500
4501 def bytes_to_long(s):
4502 """bytes_to_long(string) : long
4503 Convert a byte string to a long integer.
4504
4505 This is (essentially) the inverse of long_to_bytes().
4506 """
4507 acc = 0
4508 length = len(s)
4509 if length % 4:
4510 extra = (4 - length % 4)
4511 s = b'\000' * extra + s
4512 length = length + extra
4513 for i in range(0, length, 4):
4514 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4515 return acc
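# Round-trip sketch: long_to_bytes(0x1234) == b'\x12\x34',
# long_to_bytes(0x1234, blocksize=4) == b'\x00\x00\x12\x34',
# and bytes_to_long(b'\x12\x34') == 0x1234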
4516
4517
4518 def ohdave_rsa_encrypt(data, exponent, modulus):
4519 '''
4520 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4521
4522 Input:
4523 data: data to encrypt, bytes-like object
4524 exponent, modulus: parameter e and N of RSA algorithm, both integer
4525 Output: hex string of encrypted data
4526
4527 Limitation: supports one block encryption only
4528 '''
4529
4530 payload = int(binascii.hexlify(data[::-1]), 16)
4531 encrypted = pow(payload, exponent, modulus)
4532 return '%x' % encrypted
4533
4534
4535 def pkcs1pad(data, length):
4536 """
4537 Padding input data with PKCS#1 scheme
4538
4539 @param {int[]} data input data
4540 @param {int} length target length
4541 @returns {int[]} padded data
4542 """
4543 if len(data) > length - 11:
4544 raise ValueError('Input data too long for PKCS#1 padding')
4545
4546 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding bytes must be nonzero
4547 return [0, 2] + pseudo_random + [0] + data
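# e.g. pkcs1pad([1, 2, 3], 16) -> [0, 2, r1, ..., r10, 0, 1, 2, 3]
# where r1..r10 are random nonzero padding bytes (EME-PKCS1-v1_5 block type 2)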
4548
4549
4550 def encode_base_n(num, n, table=None):
4551 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4552 if not table:
4553 table = FULL_TABLE[:n]
4554
4555 if n > len(table):
4556 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4557
4558 if num == 0:
4559 return table[0]
4560
4561 ret = ''
4562 while num:
4563 ret = table[num % n] + ret
4564 num = num // n
4565 return ret
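# e.g. encode_base_n(255, 16) == 'ff' and encode_base_n(5, 2) == '101'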
4566
4567
4568 def decode_packed_codes(code):
4569 mobj = re.search(PACKED_CODES_RE, code)
4570 obfuscated_code, base, count, symbols = mobj.groups()
4571 base = int(base)
4572 count = int(count)
4573 symbols = symbols.split('|')
4574 symbol_table = {}
4575
4576 while count:
4577 count -= 1
4578 base_n_count = encode_base_n(count, base)
4579 symbol_table[base_n_count] = symbols[count] or base_n_count
4580
4581 return re.sub(
4582 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4583 obfuscated_code)
4584
4585
4586 def caesar(s, alphabet, shift):
4587 if shift == 0:
4588 return s
4589 l = len(alphabet)
4590 return ''.join(
4591 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4592 for c in s)
4593
4594
4595 def rot47(s):
4596 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
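# e.g. caesar('ab', 'abc', 1) == 'bc'; rot47 shifts every printable ASCII character
# 47 places, so applying it twice restores the original string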
4597
4598
4599 def parse_m3u8_attributes(attrib):
4600 info = {}
4601 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4602 if val.startswith('"'):
4603 val = val[1:-1]
4604 info[key] = val
4605 return info
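# e.g. parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#   -> {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}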
4606
4607
4608 def urshift(val, n):
4609 return val >> n if val >= 0 else (val + 0x100000000) >> n
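# e.g. urshift(-1, 24) == 0xff, emulating JavaScript's unsigned 32-bit >>> operator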
4610
4611
4612 # Based on png2str() written by @gdkchan and improved by @yokrysty
4613 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4614 def decode_png(png_data):
4615 # Reference: https://www.w3.org/TR/PNG/
4616 header = png_data[8:]
4617
4618 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4619 raise OSError('Not a valid PNG file.')
4620
4621 int_map = {1: '>B', 2: '>H', 4: '>I'}
4622 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4623
4624 chunks = []
4625
4626 while header:
4627 length = unpack_integer(header[:4])
4628 header = header[4:]
4629
4630 chunk_type = header[:4]
4631 header = header[4:]
4632
4633 chunk_data = header[:length]
4634 header = header[length:]
4635
4636 header = header[4:] # Skip CRC
4637
4638 chunks.append({
4639 'type': chunk_type,
4640 'length': length,
4641 'data': chunk_data
4642 })
4643
4644 ihdr = chunks[0]['data']
4645
4646 width = unpack_integer(ihdr[:4])
4647 height = unpack_integer(ihdr[4:8])
4648
4649 idat = b''
4650
4651 for chunk in chunks:
4652 if chunk['type'] == b'IDAT':
4653 idat += chunk['data']
4654
4655 if not idat:
4656 raise OSError('Unable to read PNG data.')
4657
4658 decompressed_data = bytearray(zlib.decompress(idat))
4659
4660 stride = width * 3
4661 pixels = []
4662
4663 def _get_pixel(idx):
4664 x = idx % stride
4665 y = idx // stride
4666 return pixels[y][x]
4667
4668 for y in range(height):
4669 basePos = y * (1 + stride)
4670 filter_type = decompressed_data[basePos]
4671
4672 current_row = []
4673
4674 pixels.append(current_row)
4675
4676 for x in range(stride):
4677 color = decompressed_data[1 + basePos + x]
4678 basex = y * stride + x
4679 left = 0
4680 up = 0
4681
4682 if x > 2:
4683 left = _get_pixel(basex - 3)
4684 if y > 0:
4685 up = _get_pixel(basex - stride)
4686
4687 if filter_type == 1: # Sub
4688 color = (color + left) & 0xff
4689 elif filter_type == 2: # Up
4690 color = (color + up) & 0xff
4691 elif filter_type == 3: # Average
4692 color = (color + ((left + up) >> 1)) & 0xff
4693 elif filter_type == 4: # Paeth
4694 a = left
4695 b = up
4696 c = 0
4697
4698 if x > 2 and y > 0:
4699 c = _get_pixel(basex - stride - 3)
4700
4701 p = a + b - c
4702
4703 pa = abs(p - a)
4704 pb = abs(p - b)
4705 pc = abs(p - c)
4706
4707 if pa <= pb and pa <= pc:
4708 color = (color + a) & 0xff
4709 elif pb <= pc:
4710 color = (color + b) & 0xff
4711 else:
4712 color = (color + c) & 0xff
4713
4714 current_row.append(color)
4715
4716 return width, height, pixels
4717
4718
4719 def write_xattr(path, key, value):
4720 # Windows: Write xattrs to NTFS Alternate Data Streams:
4721 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4722 if compat_os_name == 'nt':
4723 assert ':' not in key
4724 assert os.path.exists(path)
4725
4726 try:
4727 with open(f'{path}:{key}', 'wb') as f:
4728 f.write(value)
4729 except OSError as e:
4730 raise XAttrMetadataError(e.errno, e.strerror)
4731 return
4732
4733 # UNIX Method 1. Use xattrs/pyxattrs modules
4734 from .dependencies import xattr
4735
4736 setxattr = None
4737 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4738 # Unicode arguments are not supported in pyxattr until version 0.5.0
4739 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4740 if version_tuple(xattr.__version__) >= (0, 5, 0):
4741 setxattr = xattr.set
4742 elif xattr:
4743 setxattr = xattr.setxattr
4744
4745 if setxattr:
4746 try:
4747 setxattr(path, key, value)
4748 except OSError as e:
4749 raise XAttrMetadataError(e.errno, e.strerror)
4750 return
4751
4752 # UNIX Method 2. Use setfattr/xattr executables
4753 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4754 else 'xattr' if check_executable('xattr', ['-h']) else None)
4755 if not exe:
4756 raise XAttrUnavailableError(
4757 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4758 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4759
4760 value = value.decode()
4761 try:
4762 p = Popen(
4763 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4764 stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4765 except OSError as e:
4766 raise XAttrMetadataError(e.errno, e.strerror)
4767 stderr = p.communicate_or_kill()[1].decode('utf-8', 'replace')
4768 if p.returncode:
4769 raise XAttrMetadataError(p.returncode, stderr)
4770
4771
4772 def random_birthday(year_field, month_field, day_field):
4773 start_date = datetime.date(1950, 1, 1)
4774 end_date = datetime.date(1995, 12, 31)
4775 offset = random.randint(0, (end_date - start_date).days)
4776 random_date = start_date + datetime.timedelta(offset)
4777 return {
4778 year_field: str(random_date.year),
4779 month_field: str(random_date.month),
4780 day_field: str(random_date.day),
4781 }
4782
4783
4784 # Templates for internet shortcut files, which are plain text files.
4785 DOT_URL_LINK_TEMPLATE = '''\
4786 [InternetShortcut]
4787 URL=%(url)s
4788 '''
4789
4790 DOT_WEBLOC_LINK_TEMPLATE = '''\
4791 <?xml version="1.0" encoding="UTF-8"?>
4792 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4793 <plist version="1.0">
4794 <dict>
4795 \t<key>URL</key>
4796 \t<string>%(url)s</string>
4797 </dict>
4798 </plist>
4799 '''
4800
4801 DOT_DESKTOP_LINK_TEMPLATE = '''\
4802 [Desktop Entry]
4803 Encoding=UTF-8
4804 Name=%(filename)s
4805 Type=Link
4806 URL=%(url)s
4807 Icon=text-html
4808 '''
4809
4810 LINK_TEMPLATES = {
4811 'url': DOT_URL_LINK_TEMPLATE,
4812 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4813 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4814 }
4815
4816
4817 def iri_to_uri(iri):
4818 """
4819 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4820
4821 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-encodes only the characters that are not already escaped, using their UTF-8 byte representation, and leaves the rest of the URI intact.
4822 """
4823
4824 iri_parts = compat_urllib_parse_urlparse(iri)
4825
4826 if '[' in iri_parts.netloc:
4827 raise ValueError('IPv6 URIs are not yet supported.')
4828 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4829
4830 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4831
4832 net_location = ''
4833 if iri_parts.username:
4834 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
4835 if iri_parts.password is not None:
4836 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
4837 net_location += '@'
4838
4839 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
4840 # The 'idna' encoding produces ASCII text.
4841 if iri_parts.port is not None and iri_parts.port != 80:
4842 net_location += ':' + str(iri_parts.port)
4843
4844 return urllib.parse.urlunparse(
4845 (iri_parts.scheme,
4846 net_location,
4847
4848 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4849
4850 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4851 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4852
4853 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4854 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4855
4856 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4857
4858 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
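# e.g. iri_to_uri('http://ドメイン.example/パス?q=値') is expected to yield
# 'http://xn--eckwd4c7c.example/%E3%83%91%E3%82%B9?q=%E5%80%A4'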
4859
4860
4861 def to_high_limit_path(path):
4862 if sys.platform in ['win32', 'cygwin']:
4863 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4864 return '\\\\?\\' + os.path.abspath(path)
4865
4866 return path
4867
4868
4869 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4870 val = traverse_obj(obj, *variadic(field))
4871 if val in ignore:
4872 return default
4873 return template % (func(val) if func else val)
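# e.g. format_field({'width': 1920}, 'width', '%dpx') == '1920px'
# and format_field({}, 'width', '%dpx', default='unknown') == 'unknown'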
4874
4875
4876 def clean_podcast_url(url):
4877 return re.sub(r'''(?x)
4878 (?:
4879 (?:
4880 chtbl\.com/track|
4881 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4882 play\.podtrac\.com
4883 )/[^/]+|
4884 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4885 flex\.acast\.com|
4886 pd(?:
4887 cn\.co| # https://podcorn.com/analytics-prefix/
4888 st\.fm # https://podsights.com/docs/
4889 )/e
4890 )/''', '', url)
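# e.g. clean_podcast_url('https://chtbl.com/track/12345/traffic.example.com/ep.mp3')
#   -> 'https://traffic.example.com/ep.mp3'  (hypothetical tracking-prefixed URL)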
4891
4892
4893 _HEX_TABLE = '0123456789abcdef'
4894
4895
4896 def random_uuidv4():
4897 return re.sub(r'[xy]', lambda m: _HEX_TABLE[random.randint(0, 15) if m.group(0) == 'x' else random.randint(8, 11)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')  # 'y' must encode the RFC 4122 variant (8, 9, a or b)
4898
4899
4900 def make_dir(path, to_screen=None):
4901 try:
4902 dn = os.path.dirname(path)
4903 if dn and not os.path.exists(dn):
4904 os.makedirs(dn)
4905 return True
4906 except OSError as err:
4907 if callable(to_screen):
4908 to_screen('unable to create directory ' + error_to_compat_str(err))
4909 return False
4910
4911
4912 def get_executable_path():
4913 from zipimport import zipimporter
4914 if hasattr(sys, 'frozen'): # Running from PyInstaller
4915 path = os.path.dirname(sys.executable)
4916 elif isinstance(__loader__, zipimporter): # Running from ZIP
4917 path = os.path.join(os.path.dirname(__file__), '../..')
4918 else:
4919 path = os.path.join(os.path.dirname(__file__), '..')
4920 return os.path.abspath(path)
4921
4922
4923 def load_plugins(name, suffix, namespace):
4924 classes = {}
4925 with contextlib.suppress(FileNotFoundError):
4926 plugins_spec = importlib.util.spec_from_file_location(
4927 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
4928 plugins = importlib.util.module_from_spec(plugins_spec)
4929 sys.modules[plugins_spec.name] = plugins
4930 plugins_spec.loader.exec_module(plugins)
4931 for name in dir(plugins):
4932 if name in namespace:
4933 continue
4934 if not name.endswith(suffix):
4935 continue
4936 klass = getattr(plugins, name)
4937 classes[name] = namespace[name] = klass
4938 return classes
4939
4940
4941 def traverse_obj(
4942 obj, *path_list, default=None, expected_type=None, get_all=True,
4943 casesense=True, is_user_input=False, traverse_string=False):
4944 ''' Traverse nested list/dict/tuple
4945 @param path_list A list of paths which are checked one by one.
4946 Each path is a list of keys where each key is a:
4947 - None: Do nothing
4948 - string: A dictionary key
4949 - int: An index into a list
4950 - tuple: A list of keys all of which will be traversed
4951 - Ellipsis: Fetch all values in the object
4952 - Function: Takes the key and value as arguments
4953 and returns whether the key matches or not
4954 @param default Default value to return
4955 @param expected_type Only accept final value of this type (Can also be any callable)
4956 @param get_all Return all the values obtained from a path or only the first one
4957 @param casesense Whether to consider dictionary keys as case sensitive
4958 @param is_user_input Whether the keys are generated from user input. If True,
4959 strings are converted to int/slice if necessary
4960 @param traverse_string Whether to traverse inside strings. If True, any
4961 non-compatible object will also be converted into a string
4962 # TODO: Write tests
4963 '''
4964 if not casesense:
4965 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
4966 path_list = (map(_lower, variadic(path)) for path in path_list)
4967
4968 def _traverse_obj(obj, path, _current_depth=0):
4969 nonlocal depth
4970 path = tuple(variadic(path))
4971 for i, key in enumerate(path):
4972 if None in (key, obj):
4973 return obj
4974 if isinstance(key, (list, tuple)):
4975 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
4976 key = ...
4977 if key is ...:
4978 obj = (obj.values() if isinstance(obj, dict)
4979 else obj if isinstance(obj, (list, tuple, LazyList))
4980 else str(obj) if traverse_string else [])
4981 _current_depth += 1
4982 depth = max(depth, _current_depth)
4983 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
4984 elif callable(key):
4985 if isinstance(obj, (list, tuple, LazyList)):
4986 obj = enumerate(obj)
4987 elif isinstance(obj, dict):
4988 obj = obj.items()
4989 else:
4990 if not traverse_string:
4991 return None
4992 obj = str(obj)
4993 _current_depth += 1
4994 depth = max(depth, _current_depth)
4995 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
4996 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
4997 obj = (obj.get(key) if casesense or (key in obj)
4998 else next((v for k, v in obj.items() if _lower(k) == key), None))
4999 else:
5000 if is_user_input:
5001 key = (int_or_none(key) if ':' not in key
5002 else slice(*map(int_or_none, key.split(':'))))
5003 if key == slice(None):
5004 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5005 if not isinstance(key, (int, slice)):
5006 return None
5007 if not isinstance(obj, (list, tuple, LazyList)):
5008 if not traverse_string:
5009 return None
5010 obj = str(obj)
5011 try:
5012 obj = obj[key]
5013 except IndexError:
5014 return None
5015 return obj
5016
5017 if isinstance(expected_type, type):
5018 type_test = lambda val: val if isinstance(val, expected_type) else None
5019 elif expected_type is not None:
5020 type_test = expected_type
5021 else:
5022 type_test = lambda val: val
5023
5024 for path in path_list:
5025 depth = 0
5026 val = _traverse_obj(obj, path)
5027 if val is not None:
5028 if depth:
5029 for _ in range(depth - 1):
5030 val = itertools.chain.from_iterable(v for v in val if v is not None)
5031 val = [v for v in map(type_test, val) if v is not None]
5032 if val:
5033 return val if get_all else val[0]
5034 else:
5035 val = type_test(val)
5036 if val is not None:
5037 return val
5038 return default
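# Usage sketch (assumed data, not exhaustive):
#   d = {'a': [{'b': 1}, {'b': 2}]}
#   traverse_obj(d, ('a', 0, 'b')) == 1
#   traverse_obj(d, ('a', ..., 'b')) == [1, 2]
#   traverse_obj(d, ('a', lambda _, v: v['b'] == 2, 'b')) == [2]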
5039
5040
5041 def traverse_dict(dictn, keys, casesense=True):
5042 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5043 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5044 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5045
5046
5047 def get_first(obj, keys, **kwargs):
5048 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5049
5050
5051 def variadic(x, allowed_types=(str, bytes, dict)):
5052 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
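# e.g. variadic('abc') == ('abc',) but variadic(['a', 'b']) == ['a', 'b']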
5053
5054
5055 def decode_base(value, digits):
5056 # Convert a base-N string (over the given digit alphabet) to an integer
5057 table = {char: index for index, char in enumerate(digits)}
5058 result = 0
5059 base = len(digits)
5060 for char in value:  # avoid shadowing the builtin `chr`
5061 result *= base
5062 result += table[char]
5063 return result
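# e.g. decode_base('ff', '0123456789abcdef') == 255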
5064
5065
5066 def time_seconds(**kwargs):
5067 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5068 return t.timestamp()
5069
5070
5071 # create a JSON Web Signature (jws) with HS256 algorithm
5072 # the resulting format is in JWS Compact Serialization
5073 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5074 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5075 def jwt_encode_hs256(payload_data, key, headers={}):
5076 header_data = {
5077 'alg': 'HS256',
5078 'typ': 'JWT',
5079 }
5080 if headers:
5081 header_data.update(headers)
5082 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5083 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5084 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5085 signature_b64 = base64.b64encode(h.digest())
5086 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5087 return token
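# Usage sketch (hypothetical payload/key):
#   jwt_encode_hs256({'sub': 'user'}, 'secret') -> b'<header b64>.<payload b64>.<signature b64>'
# Note: RFC 7515 specifies unpadded base64url segments; this helper emits plain padded base64 as written above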
5088
5089
5090 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5091 def jwt_decode_hs256(jwt):
5092 header_b64, payload_b64, signature_b64 = jwt.split('.')
5093 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))  # re-add padding stripped by JWS; superfluous '='s are ignored
5094 return payload_data
5095
5096
5097 def supports_terminal_sequences(stream):
5098 if compat_os_name == 'nt':
5099 from .compat import WINDOWS_VT_MODE # Must be imported locally
5100 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5101 return False
5102 elif not os.getenv('TERM'):
5103 return False
5104 try:
5105 return stream.isatty()
5106 except BaseException:
5107 return False
5108
5109
5110 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5111
5112
5113 def remove_terminal_sequences(string):
5114 return _terminal_sequences_re.sub('', string)
5115
5116
5117 def number_of_digits(number):
5118 return len('%d' % number)
5119
5120
5121 def join_nonempty(*values, delim='-', from_dict=None):
5122 if from_dict is not None:
5123 values = map(from_dict.get, values)
5124 return delim.join(map(str, filter(None, values)))
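# e.g. join_nonempty('a', None, '', 'b') == 'a-b'
# and join_nonempty('en', 'US', delim='-') == 'en-US'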
5125
5126
5127 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5128 """
5129 Find the largest format dimensions by video width and, for each thumbnail:
5130 * Modify the URL: replace the width matched by the provided regex with the largest format width
5131 * Update dimensions
5132
5133 This function is useful with video services that scale the provided thumbnails on demand
5134 """
5135 _keys = ('width', 'height')
5136 max_dimensions = max(
5137 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5138 default=(0, 0))
5139 if not max_dimensions[0]:
5140 return thumbnails
5141 return [
5142 merge_dicts(
5143 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5144 dict(zip(_keys, max_dimensions)), thumbnail)
5145 for thumbnail in thumbnails
5146 ]
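# e.g. (hypothetical service): with formats=[{'width': 1920, 'height': 1080}],
# thumbnails=[{'url': 'https://example.com/w320/t.jpg'}] and url_width_re=r'(?<=/w)\d+',
# the thumbnail becomes {'url': 'https://example.com/w1920/t.jpg', 'width': 1920, 'height': 1080}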
5147
5148
5149 def parse_http_range(range):
5150 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5151 if not range:
5152 return None, None, None
5153 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5154 if not crg:
5155 return None, None, None
5156 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
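# e.g. parse_http_range('bytes=0-499') == (0, 499, None)
# and parse_http_range('bytes 500-999/1234') == (500, 999, 1234)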
5157
5158
5159 class Config:
5160 own_args = None
5161 filename = None
5162 __initialized = False
5163
5164 def __init__(self, parser, label=None):
5165 self._parser, self.label = parser, label
5166 self._loaded_paths, self.configs = set(), []
5167
5168 def init(self, args=None, filename=None):
5169 assert not self.__initialized
5170 directory = ''
5171 if filename:
5172 location = os.path.realpath(filename)
5173 directory = os.path.dirname(location)
5174 if location in self._loaded_paths:
5175 return False
5176 self._loaded_paths.add(location)
5177
5178 self.__initialized = True
5179 self.own_args, self.filename = args, filename
5180 for location in self._parser.parse_args(args)[0].config_locations or []:
5181 location = os.path.join(directory, expand_path(location))
5182 if os.path.isdir(location):
5183 location = os.path.join(location, 'yt-dlp.conf')
5184 if not os.path.exists(location):
5185 self._parser.error(f'config location {location} does not exist')
5186 self.append_config(self.read_file(location), location)
5187 return True
5188
5189 def __str__(self):
5190 label = join_nonempty(
5191 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5192 delim=' ')
5193 return join_nonempty(
5194 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5195 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5196 delim='\n')
5197
5198 @staticmethod
5199 def read_file(filename, default=[]):
5200 try:
5201 optionf = open(filename)
5202 except OSError:
5203 return default # silently skip if file is not present
5204 try:
5205 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5206 contents = optionf.read()
5207 res = shlex.split(contents, comments=True)
5208 finally:
5209 optionf.close()
5210 return res
5211
5212 @staticmethod
5213 def hide_login_info(opts):
5214 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5215 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5216
5217 def _scrub_eq(o):
5218 m = eqre.match(o)
5219 if m:
5220 return m.group('key') + '=PRIVATE'
5221 else:
5222 return o
5223
5224 opts = list(map(_scrub_eq, opts))
5225 for idx, opt in enumerate(opts):
5226 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5227 opts[idx + 1] = 'PRIVATE'
5228 return opts
5229
5230 def append_config(self, *args, label=None):
5231 config = type(self)(self._parser, label)
5232 config._loaded_paths = self._loaded_paths
5233 if config.init(*args):
5234 self.configs.append(config)
5235
5236 @property
5237 def all_args(self):
5238 for config in reversed(self.configs):
5239 yield from config.all_args
5240 yield from self.own_args or []
5241
5242 def parse_args(self):
5243 return self._parser.parse_args(self.all_args)
5244
5245
5246 class WebSocketsWrapper():
5247 """Wraps websockets module to use in non-async scopes"""
5248 pool = None
5249
5250 def __init__(self, url, headers=None, connect=True):
5251 self.loop = asyncio.new_event_loop()
5252 # XXX: "loop" is deprecated
5253 self.conn = websockets.connect(
5254 url, extra_headers=headers, ping_interval=None,
5255 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5256 if connect:
5257 self.__enter__()
5258 atexit.register(self.__exit__, None, None, None)
5259
5260 def __enter__(self):
5261 if not self.pool:
5262 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5263 return self
5264
5265 def send(self, *args):
5266 self.run_with_loop(self.pool.send(*args), self.loop)
5267
5268 def recv(self, *args):
5269 return self.run_with_loop(self.pool.recv(*args), self.loop)
5270
5271 def __exit__(self, type, value, traceback):
5272 try:
5273 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5274 finally:
5275 self._cancel_all_tasks(self.loop)  # must run before close(); it calls run_until_complete
5276 self.loop.close()
5277
5278 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5279 # For contributors: if any new asyncio-based library needs to run in a non-async scope, move these functions out of this class
5280 @staticmethod
5281 def run_with_loop(main, loop):
5282 if not asyncio.iscoroutine(main):
5283 raise ValueError(f'a coroutine was expected, got {main!r}')
5284
5285 try:
5286 return loop.run_until_complete(main)
5287 finally:
5288 loop.run_until_complete(loop.shutdown_asyncgens())
5289 if hasattr(loop, 'shutdown_default_executor'):
5290 loop.run_until_complete(loop.shutdown_default_executor())
5291
5292 @staticmethod
5293 def _cancel_all_tasks(loop):
5294 to_cancel = asyncio.all_tasks(loop)
5295
5296 if not to_cancel:
5297 return
5298
5299 for task in to_cancel:
5300 task.cancel()
5301
5302 # XXX: "loop" is removed in python 3.10+
5303 loop.run_until_complete(
5304 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5305
5306 for task in to_cancel:
5307 if task.cancelled():
5308 continue
5309 if task.exception() is not None:
5310 loop.call_exception_handler({
5311 'message': 'unhandled exception during asyncio.run() shutdown',
5312 'exception': task.exception(),
5313 'task': task,
5314 })
5315
5316
5317 def merge_headers(*dicts):
5318 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5319 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
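# e.g. merge_headers({'user-agent': 'A', 'X-Foo': '1'}, {'User-Agent': 'B'})
#   == {'User-Agent': 'B', 'X-Foo': '1'}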
5320
5321
5322 class classproperty:
5323 def __init__(self, f):
5324 functools.update_wrapper(self, f)
5325 self.f = f
5326
5327 def __get__(self, _, cls):
5328 return self.f(cls)
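# Usage sketch:
#   class Foo:
#       @classproperty
#       def label(cls):
#           return cls.__name__.lower()
#   Foo.label == 'foo'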
5329
5330
5331 def Namespace(**kwargs):
5332 return collections.namedtuple('Namespace', kwargs)(**kwargs)
5333
5334
5335 # Deprecated
5336 has_certifi = bool(certifi)
5337 has_websockets = bool(websockets)