#!/usr/bin/env python3
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import urllib.error
import urllib.parse
import xml.etree.ElementTree
import zlib

from .compat import (
    asyncio,
    compat_chr,
    compat_cookiejar,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_http_client,
    compat_HTTPError,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
)
from .dependencies import brotli, certifi, websockets
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)
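
# Illustrative usage of find_xpath_attr (comment only, not part of the module):
#   node = compat_etree_fromstring('<root><a x="1"/><a x="2"/></root>')
#   find_xpath_attr(node, './/a', 'x', '2')  # -> the <a> element whose x == "2"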

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
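
# Illustrative usage of xpath_with_ns (comment only, not part of the module):
#   xpath_with_ns('media:song/media:author', {'media': 'http://example.com/ns'})
#   # -> '{http://example.com/ns}song/{http://example.com/ns}author'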


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
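
# Illustrative usage (comment only, not part of the module; result traced from
# the implementation above):
#   get_element_text_and_html_by_tag('span', '<div><span>text</span></div>')
#   # -> ('text', '<span>text</span>')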


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
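
# Illustrative usage (comment only, not part of the module):
#   clean_html('<p>One</p> <p>Two&amp;Three</p>')  # -> 'One\nTwo&Three'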


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except LockingUnsupportedError:
                stream = open(filename, open_mode)
            return (stream, filename)
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
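
# Illustrative usage (comments only, not part of the module; results assume the
# default sanitization rules above):
#   sanitize_filename('New World record at 0:12:34')  # -> 'New World record at 0_12_34'
#   sanitize_filename('AT&T', restricted=True)        # -> 'AT_T'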


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
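
# Illustrative usage (comments only, not part of the module):
#   sanitize_url('//example.com/video')   # -> 'http://example.com/video'
#   sanitize_url('httpss://example.com')  # -> 'https://example.com'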


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
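
# Illustrative usage (comment only, not part of the module):
#   extract_basic_auth('http://user:pass@example.com/x')
#   # -> ('http://example.com/x', 'Basic dXNlcjpwYXNz')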


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
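
# Illustrative usage (comments only, not part of the module):
#   unescapeHTML('&amp;')     # -> '&'
#   unescapeHTML('&#47;')     # -> '/'
#   unescapeHTML('&eacute;')  # -> 'é'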


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)
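
# Illustrative usage (comment only, not part of the module):
#   timetuple_from_msec(345367)
#   # -> Time(hours=0, minutes=5, seconds=45, milliseconds=367)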


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
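
# Illustrative usage (comments only, not part of the module):
#   formatSeconds(3661)           # -> '1:01:01'
#   formatSeconds(75, msec=True)  # -> '1:15.000'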


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()
    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super().__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is
        # not always respected by websites: some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991),
        # so to work around the issue we replace the request's original URL with a
        # percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # According to RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise compat_cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
               (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                # preceded by 4 digits or hh:mm or
                  (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))  # not preceded by 3 alpha word or >= 4 alpha or 2 digits
               [ ]?                                          # optional space
               (?P<sign>\+|-)                                # +/-
               (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})    # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
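
# Illustrative usage (comment only, not part of the module):
#   parse_iso8601('2014-03-23T23:04:26+0100')  # -> 1395612266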


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
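
# Illustrative usage (comments only, not part of the module):
#   unified_strdate('December 21, 2010')         # -> '20101221'
#   unified_strdate('8/7/2009', day_first=True)  # -> '20090708'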


def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
1735
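# Example usage (illustrative; note the 12-hour clock handling via pm_delta):
#   >>> unified_timestamp('2014-12-31 6:00:00 PM')
#   1420048800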
1736
1737 def determine_ext(url, default_ext='unknown_video'):
1738 if url is None or '.' not in url:
1739 return default_ext
1740 guess = url.partition('?')[0].rpartition('.')[2]
1741 if re.match(r'^[A-Za-z0-9]+$', guess):
1742 return guess
1743 # Try to extract the ext from URLs like http://example.com/foo/bar.mp4/?download
1744 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1745 return guess.rstrip('/')
1746 else:
1747 return default_ext
1748
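# Example usage (illustrative; assumes 'mp4' is in KNOWN_EXTENSIONS, defined
# elsewhere in this module):
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'
#   >>> determine_ext('no-extension-here', default_ext='mp3')
#   'mp3'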
1749
1750 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1751 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1752
1753
1754 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1755 R"""
1756 Return a datetime object from a string.
1757 Supported format:
1758 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1759
1760 @param format strftime format of DATE
1761 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1762 auto: round to the unit provided in date_str (if applicable).
1763 """
1764 auto_precision = False
1765 if precision == 'auto':
1766 auto_precision = True
1767 precision = 'microsecond'
1768 today = datetime_round(datetime.datetime.utcnow(), precision)
1769 if date_str in ('now', 'today'):
1770 return today
1771 if date_str == 'yesterday':
1772 return today - datetime.timedelta(days=1)
1773 match = re.match(
1774 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1775 date_str)
1776 if match is not None:
1777 start_time = datetime_from_str(match.group('start'), precision, format)
1778 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1779 unit = match.group('unit')
1780 if unit == 'month' or unit == 'year':
1781 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1782 unit = 'day'
1783 else:
1784 if unit == 'week':
1785 unit = 'day'
1786 time *= 7
1787 delta = datetime.timedelta(**{unit + 's': time})
1788 new_date = start_time + delta
1789 if auto_precision:
1790 return datetime_round(new_date, unit)
1791 return new_date
1792
1793 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1794
1795
1796 def date_from_str(date_str, format='%Y%m%d', strict=False):
1797 R"""
1798 Return a date object from a string using datetime_from_str
1799
1800 @param strict Restrict allowed patterns to "YYYYMMDD" and
1801 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1802 """
1803 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1804 raise ValueError(f'Invalid date format "{date_str}"')
1805 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1806
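# Example usage (illustrative; the result depends on the current UTC date):
#   >>> date_from_str('today-1week')  # -> datetime.date for one week ago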
1807
1808 def datetime_add_months(dt, months):
1809 """Increment/Decrement a datetime object by months."""
1810 month = dt.month + months - 1
1811 year = dt.year + month // 12
1812 month = month % 12 + 1
1813 day = min(dt.day, calendar.monthrange(year, month)[1])
1814 return dt.replace(year, month, day)
1815
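# Example usage (illustrative; note the day is clamped to the target month):
#   >>> datetime_add_months(datetime.datetime(2020, 1, 31), 1)
#   datetime.datetime(2020, 2, 29, 0, 0)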
1816
1817 def datetime_round(dt, precision='day'):
1818 """
1819 Round a datetime object's time to a specific precision
1820 """
1821 if precision == 'microsecond':
1822 return dt
1823
1824 unit_seconds = {
1825 'day': 86400,
1826 'hour': 3600,
1827 'minute': 60,
1828 'second': 1,
1829 }
1830 roundto = lambda x, n: ((x + n / 2) // n) * n
1831 timestamp = calendar.timegm(dt.timetuple())
1832 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1833
1834
1835 def hyphenate_date(date_str):
1836 """
1837 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1838 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1839 if match is not None:
1840 return '-'.join(match.groups())
1841 else:
1842 return date_str
1843
1844
1845 class DateRange:
1846 """Represents a time interval between two dates"""
1847
1848 def __init__(self, start=None, end=None):
1849 """start and end must be strings in the format accepted by date"""
1850 if start is not None:
1851 self.start = date_from_str(start, strict=True)
1852 else:
1853 self.start = datetime.datetime.min.date()
1854 if end is not None:
1855 self.end = date_from_str(end, strict=True)
1856 else:
1857 self.end = datetime.datetime.max.date()
1858 if self.start > self.end:
1859 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
1860
1861 @classmethod
1862 def day(cls, day):
1863 """Returns a range that only contains the given day"""
1864 return cls(day, day)
1865
1866 def __contains__(self, date):
1867 """Check if the date is in the range"""
1868 if not isinstance(date, datetime.date):
1869 date = date_from_str(date)
1870 return self.start <= date <= self.end
1871
1872 def __str__(self):
1873 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1874
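# Example usage (illustrative values):
#   >>> '20200315' in DateRange('20200101', '20201231')
#   True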
1875
1876 def platform_name():
1877 """ Returns the platform name as a compat_str """
1878 res = platform.platform()
1879 if isinstance(res, bytes):
1880 res = res.decode(preferredencoding())
1881
1882 assert isinstance(res, compat_str)
1883 return res
1884
1885
1886 def get_windows_version():
1887 ''' Get Windows version. None if it's not running on Windows '''
1888 if compat_os_name == 'nt':
1889 return version_tuple(platform.win32_ver()[1])
1890 else:
1891 return None
1892
1893
1894 def write_string(s, out=None, encoding=None):
1895 assert isinstance(s, str)
1896 out = out or sys.stderr
1897
1898 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1899 s = re.sub(r'([\r\n]+)', r' \1', s)
1900
1901 if 'b' in getattr(out, 'mode', ''):
1902 byt = s.encode(encoding or preferredencoding(), 'ignore')
1903 out.write(byt)
1904 elif hasattr(out, 'buffer'):
1905 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1906 byt = s.encode(enc, 'ignore')
1907 out.buffer.write(byt)
1908 else:
1909 out.write(s)
1910 out.flush()
1911
1912
1913 def bytes_to_intlist(bs):
1914 if not bs:
1915 return []
1916 if isinstance(bs[0], int): # Python 3
1917 return list(bs)
1918 else:
1919 return [ord(c) for c in bs]
1920
1921
1922 def intlist_to_bytes(xs):
1923 if not xs:
1924 return b''
1925 return compat_struct_pack('%dB' % len(xs), *xs)
1926
1927
1928 class LockingUnsupportedError(IOError):
1929 msg = 'File locking is not supported on this platform'
1930
1931 def __init__(self):
1932 super().__init__(self.msg)
1933
1934
1935 # Cross-platform file locking
1936 if sys.platform == 'win32':
1937 import ctypes.wintypes
1938 import msvcrt
1939
1940 class OVERLAPPED(ctypes.Structure):
1941 _fields_ = [
1942 ('Internal', ctypes.wintypes.LPVOID),
1943 ('InternalHigh', ctypes.wintypes.LPVOID),
1944 ('Offset', ctypes.wintypes.DWORD),
1945 ('OffsetHigh', ctypes.wintypes.DWORD),
1946 ('hEvent', ctypes.wintypes.HANDLE),
1947 ]
1948
1949 kernel32 = ctypes.windll.kernel32
1950 LockFileEx = kernel32.LockFileEx
1951 LockFileEx.argtypes = [
1952 ctypes.wintypes.HANDLE, # hFile
1953 ctypes.wintypes.DWORD, # dwFlags
1954 ctypes.wintypes.DWORD, # dwReserved
1955 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1956 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1957 ctypes.POINTER(OVERLAPPED) # Overlapped
1958 ]
1959 LockFileEx.restype = ctypes.wintypes.BOOL
1960 UnlockFileEx = kernel32.UnlockFileEx
1961 UnlockFileEx.argtypes = [
1962 ctypes.wintypes.HANDLE, # hFile
1963 ctypes.wintypes.DWORD, # dwReserved
1964 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1965 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1966 ctypes.POINTER(OVERLAPPED) # Overlapped
1967 ]
1968 UnlockFileEx.restype = ctypes.wintypes.BOOL
1969 whole_low = 0xffffffff
1970 whole_high = 0x7fffffff
1971
1972 def _lock_file(f, exclusive, block):
1973 overlapped = OVERLAPPED()
1974 overlapped.Offset = 0
1975 overlapped.OffsetHigh = 0
1976 overlapped.hEvent = 0
1977 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1978
1979 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1980 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1981 0, whole_low, whole_high, f._lock_file_overlapped_p):
1982 raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
1983
1984 def _unlock_file(f):
1985 assert f._lock_file_overlapped_p
1986 handle = msvcrt.get_osfhandle(f.fileno())
1987 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
1988 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1989
1990 else:
1991 try:
1992 import fcntl
1993
1994 def _lock_file(f, exclusive, block):
1995 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1996 if not block:
1997 flags |= fcntl.LOCK_NB
1998 try:
1999 fcntl.flock(f, flags)
2000 except BlockingIOError:
2001 raise
2002 except OSError: # AOSP does not have flock()
2003 fcntl.lockf(f, flags)
2004
2005 def _unlock_file(f):
2006 try:
2007 fcntl.flock(f, fcntl.LOCK_UN)
2008 except OSError:
2009 fcntl.lockf(f, fcntl.LOCK_UN)
2010
2011 except ImportError:
2012
2013 def _lock_file(f, exclusive, block):
2014 raise LockingUnsupportedError()
2015
2016 def _unlock_file(f):
2017 raise LockingUnsupportedError()
2018
2019
2020 class locked_file:
2021 locked = False
2022
2023 def __init__(self, filename, mode, block=True, encoding=None):
2024 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2025 raise NotImplementedError(mode)
2026 self.mode, self.block = mode, block
2027
2028 writable = any(f in mode for f in 'wax+')
2029 readable = any(f in mode for f in 'r+')
2030 flags = functools.reduce(operator.ior, (
2031 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2032 getattr(os, 'O_BINARY', 0), # Windows only
2033 getattr(os, 'O_NOINHERIT', 0), # Windows only
2034 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2035 os.O_APPEND if 'a' in mode else 0,
2036 os.O_EXCL if 'x' in mode else 0,
2037 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2038 ))
2039
2040 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2041
2042 def __enter__(self):
2043 exclusive = 'r' not in self.mode
2044 try:
2045 _lock_file(self.f, exclusive, self.block)
2046 self.locked = True
2047 except OSError:
2048 self.f.close()
2049 raise
2050 if 'w' in self.mode:
2051 try:
2052 self.f.truncate()
2053 except OSError as e:
2054 if e.errno != errno.ESPIPE:  # Illegal seek, expected when self.f is a FIFO
2055 raise
2056 return self
2057
2058 def unlock(self):
2059 if not self.locked:
2060 return
2061 try:
2062 _unlock_file(self.f)
2063 finally:
2064 self.locked = False
2065
2066 def __exit__(self, *_):
2067 try:
2068 self.unlock()
2069 finally:
2070 self.f.close()
2071
2072 open = __enter__
2073 close = __exit__
2074
2075 def __getattr__(self, attr):
2076 return getattr(self.f, attr)
2077
2078 def __iter__(self):
2079 return iter(self.f)
2080
2081
2082 def get_filesystem_encoding():
2083 encoding = sys.getfilesystemencoding()
2084 return encoding if encoding is not None else 'utf-8'
2085
2086
2087 def shell_quote(args):
2088 quoted_args = []
2089 encoding = get_filesystem_encoding()
2090 for a in args:
2091 if isinstance(a, bytes):
2092 # We may get a filename encoded with 'encodeFilename'
2093 a = a.decode(encoding)
2094 quoted_args.append(compat_shlex_quote(a))
2095 return ' '.join(quoted_args)
2096
2097
2098 def smuggle_url(url, data):
2099 """ Pass additional data in a URL for internal use. """
2100
2101 url, idata = unsmuggle_url(url, {})
2102 data.update(idata)
2103 sdata = compat_urllib_parse_urlencode(
2104 {'__youtubedl_smuggle': json.dumps(data)})
2105 return url + '#' + sdata
2106
2107
2108 def unsmuggle_url(smug_url, default=None):
2109 if '#__youtubedl_smuggle' not in smug_url:
2110 return smug_url, default
2111 url, _, sdata = smug_url.rpartition('#')
2112 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2113 data = json.loads(jsond)
2114 return url, data
2115
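# Example usage (illustrative values; the smuggled payload round-trips):
#   >>> url = smuggle_url('https://example.com/video', {'referer': 'https://example.com'})
#   >>> unsmuggle_url(url)
#   ('https://example.com/video', {'referer': 'https://example.com'})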
2116
2117 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2118 """ Formats numbers with decimal sufixes like K, M, etc """
2119 num, factor = float_or_none(num), float(factor)
2120 if num is None or num < 0:
2121 return None
2122 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2123 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2124 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2125 if factor == 1024:
2126 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2127 converted = num / (factor ** exponent)
2128 return fmt % (converted, suffix)
2129
2130
2131 def format_bytes(bytes):
2132 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2133
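# Example usage (illustrative values):
#   >>> format_decimal_suffix(1234567)
#   '1M'
#   >>> format_bytes(1536)
#   '1.50KiB'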
2134
2135 def lookup_unit_table(unit_table, s):
2136 units_re = '|'.join(re.escape(u) for u in unit_table)
2137 m = re.match(
2138 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2139 if not m:
2140 return None
2141 num_str = m.group('num').replace(',', '.')
2142 mult = unit_table[m.group('unit')]
2143 return int(float(num_str) * mult)
2144
2145
2146 def parse_filesize(s):
2147 if s is None:
2148 return None
2149
2150 # The lower-case forms are of course incorrect and unofficial,
2151 # but we support those too
2152 _UNIT_TABLE = {
2153 'B': 1,
2154 'b': 1,
2155 'bytes': 1,
2156 'KiB': 1024,
2157 'KB': 1000,
2158 'kB': 1024,
2159 'Kb': 1000,
2160 'kb': 1000,
2161 'kilobytes': 1000,
2162 'kibibytes': 1024,
2163 'MiB': 1024 ** 2,
2164 'MB': 1000 ** 2,
2165 'mB': 1024 ** 2,
2166 'Mb': 1000 ** 2,
2167 'mb': 1000 ** 2,
2168 'megabytes': 1000 ** 2,
2169 'mebibytes': 1024 ** 2,
2170 'GiB': 1024 ** 3,
2171 'GB': 1000 ** 3,
2172 'gB': 1024 ** 3,
2173 'Gb': 1000 ** 3,
2174 'gb': 1000 ** 3,
2175 'gigabytes': 1000 ** 3,
2176 'gibibytes': 1024 ** 3,
2177 'TiB': 1024 ** 4,
2178 'TB': 1000 ** 4,
2179 'tB': 1024 ** 4,
2180 'Tb': 1000 ** 4,
2181 'tb': 1000 ** 4,
2182 'terabytes': 1000 ** 4,
2183 'tebibytes': 1024 ** 4,
2184 'PiB': 1024 ** 5,
2185 'PB': 1000 ** 5,
2186 'pB': 1024 ** 5,
2187 'Pb': 1000 ** 5,
2188 'pb': 1000 ** 5,
2189 'petabytes': 1000 ** 5,
2190 'pebibytes': 1024 ** 5,
2191 'EiB': 1024 ** 6,
2192 'EB': 1000 ** 6,
2193 'eB': 1024 ** 6,
2194 'Eb': 1000 ** 6,
2195 'eb': 1000 ** 6,
2196 'exabytes': 1000 ** 6,
2197 'exbibytes': 1024 ** 6,
2198 'ZiB': 1024 ** 7,
2199 'ZB': 1000 ** 7,
2200 'zB': 1024 ** 7,
2201 'Zb': 1000 ** 7,
2202 'zb': 1000 ** 7,
2203 'zettabytes': 1000 ** 7,
2204 'zebibytes': 1024 ** 7,
2205 'YiB': 1024 ** 8,
2206 'YB': 1000 ** 8,
2207 'yB': 1024 ** 8,
2208 'Yb': 1000 ** 8,
2209 'yb': 1000 ** 8,
2210 'yottabytes': 1000 ** 8,
2211 'yobibytes': 1024 ** 8,
2212 }
2213
2214 return lookup_unit_table(_UNIT_TABLE, s)
2215
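# Example usage (illustrative values):
#   >>> parse_filesize('1.5 MiB')
#   1572864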
2216
2217 def parse_count(s):
2218 if s is None:
2219 return None
2220
2221 s = re.sub(r'^[^\d]+\s', '', s).strip()
2222
2223 if re.match(r'^[\d,.]+$', s):
2224 return str_to_int(s)
2225
2226 _UNIT_TABLE = {
2227 'k': 1000,
2228 'K': 1000,
2229 'm': 1000 ** 2,
2230 'M': 1000 ** 2,
2231 'kk': 1000 ** 2,
2232 'KK': 1000 ** 2,
2233 'b': 1000 ** 3,
2234 'B': 1000 ** 3,
2235 }
2236
2237 ret = lookup_unit_table(_UNIT_TABLE, s)
2238 if ret is not None:
2239 return ret
2240
2241 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2242 if mobj:
2243 return str_to_int(mobj.group(1))
2244
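# Example usage (illustrative values):
#   >>> parse_count('1.2M')
#   1200000
#   >>> parse_count('1,234 views')
#   1234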
2245
2246 def parse_resolution(s, *, lenient=False):
2247 if s is None:
2248 return {}
2249
2250 if lenient:
2251 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2252 else:
2253 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2254 if mobj:
2255 return {
2256 'width': int(mobj.group('w')),
2257 'height': int(mobj.group('h')),
2258 }
2259
2260 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2261 if mobj:
2262 return {'height': int(mobj.group(1))}
2263
2264 mobj = re.search(r'\b([48])[kK]\b', s)
2265 if mobj:
2266 return {'height': int(mobj.group(1)) * 540}
2267
2268 return {}
2269
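# Example usage (illustrative values):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}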
2270
2271 def parse_bitrate(s):
2272 if not isinstance(s, compat_str):
2273 return
2274 mobj = re.search(r'\b(\d+)\s*kbps', s)
2275 if mobj:
2276 return int(mobj.group(1))
2277
2278
2279 def month_by_name(name, lang='en'):
2280 """ Return the number of a month by (locale-independently) English name """
2281
2282 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2283
2284 try:
2285 return month_names.index(name) + 1
2286 except ValueError:
2287 return None
2288
2289
2290 def month_by_abbreviation(abbrev):
2291 """ Return the number of a month by (locale-independently) English
2292 abbreviations """
2293
2294 try:
2295 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2296 except ValueError:
2297 return None
2298
2299
2300 def fix_xml_ampersands(xml_str):
2301 """Replace all the '&' by '&amp;' in XML"""
2302 return re.sub(
2303 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2304 '&amp;',
2305 xml_str)
2306
2307
2308 def setproctitle(title):
2309 assert isinstance(title, compat_str)
2310
2311 # ctypes in Jython is not complete
2312 # http://bugs.jython.org/issue2148
2313 if sys.platform.startswith('java'):
2314 return
2315
2316 try:
2317 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2318 except OSError:
2319 return
2320 except TypeError:
2321 # LoadLibrary in Windows Python 2.7.13 only expects
2322 # a bytestring, but since unicode_literals turns
2323 # every string into a unicode string, it fails.
2324 return
2325 title_bytes = title.encode()
2326 buf = ctypes.create_string_buffer(len(title_bytes))
2327 buf.value = title_bytes
2328 try:
2329 libc.prctl(15, buf, 0, 0, 0)
2330 except AttributeError:
2331 return # Strange libc, just skip this
2332
2333
2334 def remove_start(s, start):
2335 return s[len(start):] if s is not None and s.startswith(start) else s
2336
2337
2338 def remove_end(s, end):
2339 return s[:-len(end)] if s is not None and s.endswith(end) else s
2340
2341
2342 def remove_quotes(s):
2343 if s is None or len(s) < 2:
2344 return s
2345 for quote in ('"', "'", ):
2346 if s[0] == quote and s[-1] == quote:
2347 return s[1:-1]
2348 return s
2349
2350
2351 def get_domain(url):
2352 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2353 return domain.group('domain') if domain else None
2354
2355
2356 def url_basename(url):
2357 path = compat_urlparse.urlparse(url).path
2358 return path.strip('/').split('/')[-1]
2359
2360
2361 def base_url(url):
2362 return re.match(r'https?://[^?#&]+/', url).group()
2363
2364
2365 def urljoin(base, path):
2366 if isinstance(path, bytes):
2367 path = path.decode()
2368 if not isinstance(path, compat_str) or not path:
2369 return None
2370 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2371 return path
2372 if isinstance(base, bytes):
2373 base = base.decode()
2374 if not isinstance(base, compat_str) or not re.match(
2375 r'^(?:https?:)?//', base):
2376 return None
2377 return compat_urlparse.urljoin(base, path)
2378
2379
2380 class HEADRequest(compat_urllib_request.Request):
2381 def get_method(self):
2382 return 'HEAD'
2383
2384
2385 class PUTRequest(compat_urllib_request.Request):
2386 def get_method(self):
2387 return 'PUT'
2388
2389
2390 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2391 if get_attr and v is not None:
2392 v = getattr(v, get_attr, None)
2393 try:
2394 return int(v) * invscale // scale
2395 except (ValueError, TypeError, OverflowError):
2396 return default
2397
2398
2399 def str_or_none(v, default=None):
2400 return default if v is None else compat_str(v)
2401
2402
2403 def str_to_int(int_str):
2404 """ A more relaxed version of int_or_none """
2405 if isinstance(int_str, int):
2406 return int_str
2407 elif isinstance(int_str, compat_str):
2408 int_str = re.sub(r'[,\.\+]', '', int_str)
2409 return int_or_none(int_str)
2410
2411
2412 def float_or_none(v, scale=1, invscale=1, default=None):
2413 if v is None:
2414 return default
2415 try:
2416 return float(v) * invscale / scale
2417 except (ValueError, TypeError):
2418 return default
2419
2420
2421 def bool_or_none(v, default=None):
2422 return v if isinstance(v, bool) else default
2423
2424
2425 def strip_or_none(v, default=None):
2426 return v.strip() if isinstance(v, compat_str) else default
2427
2428
2429 def url_or_none(url):
2430 if not url or not isinstance(url, compat_str):
2431 return None
2432 url = url.strip()
2433 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2434
2435
2436 def request_to_url(req):
2437 if isinstance(req, compat_urllib_request.Request):
2438 return req.get_full_url()
2439 else:
2440 return req
2441
2442
2443 def strftime_or_none(timestamp, date_format, default=None):
2444 datetime_object = None
2445 try:
2446 if isinstance(timestamp, (int, float)): # unix timestamp
2447 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2448 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2449 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2450 return datetime_object.strftime(date_format)
2451 except (ValueError, TypeError, AttributeError):
2452 return default
2453
2454
2455 def parse_duration(s):
2456 if not isinstance(s, str):
2457 return None
2458 s = s.strip()
2459 if not s:
2460 return None
2461
2462 days, hours, mins, secs, ms = [None] * 5
2463 m = re.match(r'''(?x)
2464 (?P<before_secs>
2465 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2466 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2467 (?P<ms>[.:][0-9]+)?Z?$
2468 ''', s)
2469 if m:
2470 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2471 else:
2472 m = re.match(
2473 r'''(?ix)(?:P?
2474 (?:
2475 [0-9]+\s*y(?:ears?)?,?\s*
2476 )?
2477 (?:
2478 [0-9]+\s*m(?:onths?)?,?\s*
2479 )?
2480 (?:
2481 [0-9]+\s*w(?:eeks?)?,?\s*
2482 )?
2483 (?:
2484 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2485 )?
2486 T)?
2487 (?:
2488 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2489 )?
2490 (?:
2491 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2492 )?
2493 (?:
2494 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2495 )?Z?$''', s)
2496 if m:
2497 days, hours, mins, secs, ms = m.groups()
2498 else:
2499 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2500 if m:
2501 hours, mins = m.groups()
2502 else:
2503 return None
2504
2505 if ms:
2506 ms = ms.replace(':', '.')
2507 return sum(float(part or 0) * mult for part, mult in (
2508 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2509
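# Example usage (illustrative values; both clock-style and ISO 8601-style
# durations are supported):
#   >>> parse_duration('1:23:45')
#   5025.0
#   >>> parse_duration('PT1H30M')
#   5400.0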
2510
2511 def prepend_extension(filename, ext, expected_real_ext=None):
2512 name, real_ext = os.path.splitext(filename)
2513 return (
2514 f'{name}.{ext}{real_ext}'
2515 if not expected_real_ext or real_ext[1:] == expected_real_ext
2516 else f'{filename}.{ext}')
2517
2518
2519 def replace_extension(filename, ext, expected_real_ext=None):
2520 name, real_ext = os.path.splitext(filename)
2521 return '{}.{}'.format(
2522 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2523 ext)
2524
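# Example usage (illustrative values; 'f137' is a hypothetical format id):
#   >>> prepend_extension('video.mp4', 'f137', expected_real_ext='mp4')
#   'video.f137.mp4'
#   >>> replace_extension('video.mp4', 'mkv')
#   'video.mkv'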
2525
2526 def check_executable(exe, args=[]):
2527 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2528 args can be a list of arguments for a short output (like -version) """
2529 try:
2530 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2531 except OSError:
2532 return False
2533 return exe
2534
2535
2536 def _get_exe_version_output(exe, args, *, to_screen=None):
2537 if to_screen:
2538 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2539 try:
2540 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2541 # SIGTTOU if yt-dlp is run in the background.
2542 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2543 out, _ = Popen(
2544 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2545 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2546 except OSError:
2547 return False
2548 if isinstance(out, bytes): # Python 2.x
2549 out = out.decode('ascii', 'ignore')
2550 return out
2551
2552
2553 def detect_exe_version(output, version_re=None, unrecognized='present'):
2554 assert isinstance(output, compat_str)
2555 if version_re is None:
2556 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2557 m = re.search(version_re, output)
2558 if m:
2559 return m.group(1)
2560 else:
2561 return unrecognized
2562
2563
2564 def get_exe_version(exe, args=['--version'],
2565 version_re=None, unrecognized='present'):
2566 """ Returns the version of the specified executable,
2567 or False if the executable is not present """
2568 out = _get_exe_version_output(exe, args)
2569 return detect_exe_version(out, version_re, unrecognized) if out else False
2570
2571
2572 class LazyList(collections.abc.Sequence):
2573 """Lazy immutable list from an iterable
2574 Note that slices of a LazyList are lists and not LazyList"""
2575
2576 class IndexError(IndexError):
2577 pass
2578
2579 def __init__(self, iterable, *, reverse=False, _cache=None):
2580 self._iterable = iter(iterable)
2581 self._cache = [] if _cache is None else _cache
2582 self._reversed = reverse
2583
2584 def __iter__(self):
2585 if self._reversed:
2586 # We need to consume the entire iterable to iterate in reverse
2587 yield from self.exhaust()
2588 return
2589 yield from self._cache
2590 for item in self._iterable:
2591 self._cache.append(item)
2592 yield item
2593
2594 def _exhaust(self):
2595 self._cache.extend(self._iterable)
2596 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2597 return self._cache
2598
2599 def exhaust(self):
2600 """Evaluate the entire iterable"""
2601 return self._exhaust()[::-1 if self._reversed else 1]
2602
2603 @staticmethod
2604 def _reverse_index(x):
2605 return None if x is None else -(x + 1)
2606
2607 def __getitem__(self, idx):
2608 if isinstance(idx, slice):
2609 if self._reversed:
2610 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2611 start, stop, step = idx.start, idx.stop, idx.step or 1
2612 elif isinstance(idx, int):
2613 if self._reversed:
2614 idx = self._reverse_index(idx)
2615 start, stop, step = idx, idx, 0
2616 else:
2617 raise TypeError('indices must be integers or slices')
2618 if ((start or 0) < 0 or (stop or 0) < 0
2619 or (start is None and step < 0)
2620 or (stop is None and step > 0)):
2621 # We need to consume the entire iterable to be able to slice from the end
2622 # Obviously, never use this with infinite iterables
2623 self._exhaust()
2624 try:
2625 return self._cache[idx]
2626 except IndexError as e:
2627 raise self.IndexError(e) from e
2628 n = max(start or 0, stop or 0) - len(self._cache) + 1
2629 if n > 0:
2630 self._cache.extend(itertools.islice(self._iterable, n))
2631 try:
2632 return self._cache[idx]
2633 except IndexError as e:
2634 raise self.IndexError(e) from e
2635
2636 def __bool__(self):
2637 try:
2638 self[-1] if self._reversed else self[0]
2639 except self.IndexError:
2640 return False
2641 return True
2642
2643 def __len__(self):
2644 self._exhaust()
2645 return len(self._cache)
2646
2647 def __reversed__(self):
2648 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2649
2650 def __copy__(self):
2651 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2652
2653 def __repr__(self):
2654 # repr and str should mimic a list. So we exhaust the iterable
2655 return repr(self.exhaust())
2656
2657 def __str__(self):
2658 return repr(self.exhaust())
2659
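# Example usage (illustrative; only as much of the iterable is consumed
# as the requested indices require):
#   >>> lst = LazyList(itertools.count())  # an infinite iterator is fine
#   >>> lst[5]
#   5
#   >>> lst[:3]
#   [0, 1, 2]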
2660
2661 class PagedList:
2662
2663 class IndexError(IndexError):
2664 pass
2665
2666 def __len__(self):
2667 # This is only useful for tests
2668 return len(self.getslice())
2669
2670 def __init__(self, pagefunc, pagesize, use_cache=True):
2671 self._pagefunc = pagefunc
2672 self._pagesize = pagesize
2673 self._pagecount = float('inf')
2674 self._use_cache = use_cache
2675 self._cache = {}
2676
2677 def getpage(self, pagenum):
2678 page_results = self._cache.get(pagenum)
2679 if page_results is None:
2680 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2681 if self._use_cache:
2682 self._cache[pagenum] = page_results
2683 return page_results
2684
2685 def getslice(self, start=0, end=None):
2686 return list(self._getslice(start, end))
2687
2688 def _getslice(self, start, end):
2689 raise NotImplementedError('This method must be implemented by subclasses')
2690
2691 def __getitem__(self, idx):
2692 assert self._use_cache, 'Indexing PagedList requires cache'
2693 if not isinstance(idx, int) or idx < 0:
2694 raise TypeError('indices must be non-negative integers')
2695 entries = self.getslice(idx, idx + 1)
2696 if not entries:
2697 raise self.IndexError()
2698 return entries[0]
2699
2700
2701 class OnDemandPagedList(PagedList):
2702 """Download pages until a page with less than maximum results"""
2703
2704 def _getslice(self, start, end):
2705 for pagenum in itertools.count(start // self._pagesize):
2706 firstid = pagenum * self._pagesize
2707 nextfirstid = pagenum * self._pagesize + self._pagesize
2708 if start >= nextfirstid:
2709 continue
2710
2711 startv = (
2712 start % self._pagesize
2713 if firstid <= start < nextfirstid
2714 else 0)
2715 endv = (
2716 ((end - 1) % self._pagesize) + 1
2717 if (end is not None and firstid <= end <= nextfirstid)
2718 else None)
2719
2720 try:
2721 page_results = self.getpage(pagenum)
2722 except Exception:
2723 self._pagecount = pagenum - 1
2724 raise
2725 if startv != 0 or endv is not None:
2726 page_results = page_results[startv:endv]
2727 yield from page_results
2728
2729 # A little optimization - if the current page is not "full", i.e. does
2730 # not contain page_size videos, then we can assume that this page
2731 # is the last one - there are no more ids on further pages,
2732 # so there is no need to query again.
2733 if len(page_results) + startv < self._pagesize:
2734 break
2735
2736 # If we got the whole page, but the next page is not interesting,
2737 # break out early as well
2738 if end == nextfirstid:
2739 break
2740
2741
2742 class InAdvancePagedList(PagedList):
2743 """PagedList with total number of pages known in advance"""
2744
2745 def __init__(self, pagefunc, pagecount, pagesize):
2746 PagedList.__init__(self, pagefunc, pagesize, True)
2747 self._pagecount = pagecount
2748
2749 def _getslice(self, start, end):
2750 start_page = start // self._pagesize
2751 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2752 skip_elems = start - start_page * self._pagesize
2753 only_more = None if end is None else end - start
2754 for pagenum in range(start_page, end_page):
2755 page_results = self.getpage(pagenum)
2756 if skip_elems:
2757 page_results = page_results[skip_elems:]
2758 skip_elems = None
2759 if only_more is not None:
2760 if len(page_results) < only_more:
2761 only_more -= len(page_results)
2762 else:
2763 yield from page_results[:only_more]
2764 break
2765 yield from page_results
2766
2767
2768 def uppercase_escape(s):
2769 unicode_escape = codecs.getdecoder('unicode_escape')
2770 return re.sub(
2771 r'\\U[0-9a-fA-F]{8}',
2772 lambda m: unicode_escape(m.group(0))[0],
2773 s)
2774
2775
2776 def lowercase_escape(s):
2777 unicode_escape = codecs.getdecoder('unicode_escape')
2778 return re.sub(
2779 r'\\u[0-9a-fA-F]{4}',
2780 lambda m: unicode_escape(m.group(0))[0],
2781 s)
2782
2783
2784 def escape_rfc3986(s):
2785 """Escape non-ASCII characters as suggested by RFC 3986"""
2786 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2787
2788
2789 def escape_url(url):
2790 """Escape URL as suggested by RFC 3986"""
2791 url_parsed = compat_urllib_parse_urlparse(url)
2792 return url_parsed._replace(
2793 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2794 path=escape_rfc3986(url_parsed.path),
2795 params=escape_rfc3986(url_parsed.params),
2796 query=escape_rfc3986(url_parsed.query),
2797 fragment=escape_rfc3986(url_parsed.fragment)
2798 ).geturl()
2799
2800
2801 def parse_qs(url):
2802 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2803
2804
2805 def read_batch_urls(batch_fd):
2806 def fixup(url):
2807 if not isinstance(url, compat_str):
2808 url = url.decode('utf-8', 'replace')
2809 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2810 for bom in BOM_UTF8:
2811 if url.startswith(bom):
2812 url = url[len(bom):]
2813 url = url.lstrip()
2814 if not url or url.startswith(('#', ';', ']')):
2815 return False
2816 # "#" cannot be stripped out since it is part of the URI
2817 # However, it can be safely stripped out if it follows a whitespace
2818 return re.split(r'\s#', url, 1)[0].rstrip()
2819
2820 with contextlib.closing(batch_fd) as fd:
2821 return [url for url in map(fixup, fd) if url]
2822
2823
2824 def urlencode_postdata(*args, **kargs):
2825 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2826
2827
2828 def update_url_query(url, query):
2829 if not query:
2830 return url
2831 parsed_url = compat_urlparse.urlparse(url)
2832 qs = compat_parse_qs(parsed_url.query)
2833 qs.update(query)
2834 return compat_urlparse.urlunparse(parsed_url._replace(
2835 query=compat_urllib_parse_urlencode(qs, True)))
2836
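# Example usage (illustrative values):
#   >>> update_url_query('https://example.com/path?a=1', {'b': '2'})
#   'https://example.com/path?a=1&b=2'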
2837
2838 def update_Request(req, url=None, data=None, headers={}, query={}):
2839 req_headers = req.headers.copy()
2840 req_headers.update(headers)
2841 req_data = data or req.data
2842 req_url = update_url_query(url or req.get_full_url(), query)
2843 req_get_method = req.get_method()
2844 if req_get_method == 'HEAD':
2845 req_type = HEADRequest
2846 elif req_get_method == 'PUT':
2847 req_type = PUTRequest
2848 else:
2849 req_type = compat_urllib_request.Request
2850 new_req = req_type(
2851 req_url, data=req_data, headers=req_headers,
2852 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2853 if hasattr(req, 'timeout'):
2854 new_req.timeout = req.timeout
2855 return new_req
2856
2857
2858 def _multipart_encode_impl(data, boundary):
2859 content_type = 'multipart/form-data; boundary=%s' % boundary
2860
2861 out = b''
2862 for k, v in data.items():
2863 out += b'--' + boundary.encode('ascii') + b'\r\n'
2864 if isinstance(k, compat_str):
2865 k = k.encode()
2866 if isinstance(v, compat_str):
2867 v = v.encode()
2868 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2869 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2870 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2871 if boundary.encode('ascii') in content:
2872 raise ValueError('Boundary overlaps with data')
2873 out += content
2874
2875 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2876
2877 return out, content_type
2878
2879
2880 def multipart_encode(data, boundary=None):
2881 '''
2882 Encode a dict to RFC 7578-compliant form-data
2883
2884 data:
2885 A dict where keys and values can be either Unicode or bytes-like
2886 objects.
2887 boundary:
2888 If specified a Unicode object, it's used as the boundary. Otherwise
2889 a random boundary is generated.
2890
2891 Reference: https://tools.ietf.org/html/rfc7578
2892 '''
2893 has_specified_boundary = boundary is not None
2894
2895 while True:
2896 if boundary is None:
2897 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2898
2899 try:
2900 out, content_type = _multipart_encode_impl(data, boundary)
2901 break
2902 except ValueError:
2903 if has_specified_boundary:
2904 raise
2905 boundary = None
2906
2907 return out, content_type
2908
2909
2910 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2911 for val in map(d.get, variadic(key_or_keys)):
2912 if val is not None and (val or not skip_false_values):
2913 return val
2914 return default
2915
2916
2917 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2918 for f in funcs:
2919 try:
2920 val = f(*args, **kwargs)
2921 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
2922 pass
2923 else:
2924 if expected_type is None or isinstance(val, expected_type):
2925 return val
2926
2927
2928 def try_get(src, getter, expected_type=None):
2929 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2930
2931
2932 def filter_dict(dct, cndn=lambda _, v: v is not None):
2933 return {k: v for k, v in dct.items() if cndn(k, v)}
2934
2935
2936 def merge_dicts(*dicts):
2937 merged = {}
2938 for a_dict in dicts:
2939 for k, v in a_dict.items():
2940 if (v is not None and k not in merged
2941 or isinstance(v, str) and merged[k] == ''):
2942 merged[k] = v
2943 return merged
2944
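# Example usage (illustrative; later dicts only fill gaps or replace
# empty strings, they do not overwrite existing values):
#   >>> merge_dicts({'title': ''}, {'title': 'Video'}, {'id': '42'})
#   {'title': 'Video', 'id': '42'}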
2945
2946 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2947 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2948
2949
2950 US_RATINGS = {
2951 'G': 0,
2952 'PG': 10,
2953 'PG-13': 13,
2954 'R': 16,
2955 'NC': 18,
2956 }
2957
2958
2959 TV_PARENTAL_GUIDELINES = {
2960 'TV-Y': 0,
2961 'TV-Y7': 7,
2962 'TV-G': 0,
2963 'TV-PG': 0,
2964 'TV-14': 14,
2965 'TV-MA': 17,
2966 }
2967
2968
2969 def parse_age_limit(s):
2970 # isinstance(False, int) is True. So type() must be used instead
2971 if type(s) is int:
2972 return s if 0 <= s <= 21 else None
2973 elif not isinstance(s, str):
2974 return None
2975 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2976 if m:
2977 return int(m.group('age'))
2978 s = s.upper()
2979 if s in US_RATINGS:
2980 return US_RATINGS[s]
2981 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
2982 if m:
2983 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
2984 return None
2985
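# Example usage (illustrative values):
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17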
2986
2987 def strip_jsonp(code):
2988 return re.sub(
2989 r'''(?sx)^
2990 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2991 (?:\s*&&\s*(?P=func_name))?
2992 \s*\(\s*(?P<callback_data>.*)\);?
2993 \s*?(?://[^\n]*)*$''',
2994 r'\g<callback_data>', code)
2995
2996
2997 def js_to_json(code, vars={}):
2998 # vars is a dict of var, val pairs to substitute
2999 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3000 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3001 INTEGER_TABLE = (
3002 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3003 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3004 )
3005
3006 def fix_kv(m):
3007 v = m.group(0)
3008 if v in ('true', 'false', 'null'):
3009 return v
3010 elif v in ('undefined', 'void 0'):
3011 return 'null'
3012 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3013 return ""
3014
3015 if v[0] in ("'", '"'):
3016 v = re.sub(r'(?s)\\.|"', lambda m: {
3017 '"': '\\"',
3018 "\\'": "'",
3019 '\\\n': '',
3020 '\\x': '\\u00',
3021 }.get(m.group(0), m.group(0)), v[1:-1])
3022 else:
3023 for regex, base in INTEGER_TABLE:
3024 im = re.match(regex, v)
3025 if im:
3026 i = int(im.group(1), base)
3027 return '"%d":' % i if v.endswith(':') else '%d' % i
3028
3029 if v in vars:
3030 return vars[v]
3031
3032 return '"%s"' % v
3033
3034 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3035
3036 return re.sub(r'''(?sx)
3037 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3038 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3039 {comment}|,(?={skip}[\]}}])|
3040 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3041 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3042 [0-9]+(?={skip}:)|
3043 !+
3044 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3045
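# Example usage (illustrative; bare keys are quoted and hex literals
# are converted to decimal):
#   >>> js_to_json("{abc: 'def', key: 0x1b}")
#   '{"abc": "def", "key": 27}'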
3046
3047 def qualities(quality_ids):
3048 """ Get a numeric quality value out of a list of possible values """
3049 def q(qid):
3050 try:
3051 return quality_ids.index(qid)
3052 except ValueError:
3053 return -1
3054 return q
3055
3056
3057 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
3058
3059
3060 DEFAULT_OUTTMPL = {
3061 'default': '%(title)s [%(id)s].%(ext)s',
3062 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3063 }
3064 OUTTMPL_TYPES = {
3065 'chapter': None,
3066 'subtitle': None,
3067 'thumbnail': None,
3068 'description': 'description',
3069 'annotation': 'annotations.xml',
3070 'infojson': 'info.json',
3071 'link': None,
3072 'pl_video': None,
3073 'pl_thumbnail': None,
3074 'pl_description': 'description',
3075 'pl_infojson': 'info.json',
3076 }
3077
3078 # As of [1], the format syntax is:
3079 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3080 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3081 STR_FORMAT_RE_TMPL = r'''(?x)
3082 (?<!%)(?P<prefix>(?:%%)*)
3083 %
3084 (?P<has_key>\((?P<key>{0})\))?
3085 (?P<format>
3086 (?P<conversion>[#0\-+ ]+)?
3087 (?P<min_width>\d+)?
3088 (?P<precision>\.\d+)?
3089 (?P<len_mod>[hlL])? # unused in python
3090 {1} # conversion type
3091 )
3092 '''
3093
3094
3095 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3096
3097
3098 def limit_length(s, length):
3099 """ Add ellipses to overly long strings """
3100 if s is None:
3101 return None
3102 ELLIPSES = '...'
3103 if len(s) > length:
3104 return s[:length - len(ELLIPSES)] + ELLIPSES
3105 return s
3106
3107
3108 def version_tuple(v):
3109 return tuple(int(e) for e in re.split(r'[-.]', v))
3110
3111
3112 def is_outdated_version(version, limit, assume_new=True):
3113 if not version:
3114 return not assume_new
3115 try:
3116 return version_tuple(version) < version_tuple(limit)
3117 except ValueError:
3118 return not assume_new
3119
3120
3121 def ytdl_is_updateable():
3122 """ Returns if yt-dlp can be updated with -U """
3123
3124 from .update import is_non_updateable
3125
3126 return not is_non_updateable()
3127
3128
3129 def args_to_str(args):
3130 # Get a short string representation for a subprocess command
3131 return ' '.join(compat_shlex_quote(a) for a in args)
3132
3133
3134 def error_to_compat_str(err):
3135 return str(err)
3136
3137
3138 def error_to_str(err):
3139 return f'{type(err).__name__}: {err}'
3140
3141
3142 def mimetype2ext(mt):
3143 if mt is None:
3144 return None
3145
3146 mt, _, params = mt.partition(';')
3147 mt = mt.strip()
3148
3149 FULL_MAP = {
3150 'audio/mp4': 'm4a',
3151 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
3152 # since it's the most popular one
3153 'audio/mpeg': 'mp3',
3154 'audio/x-wav': 'wav',
3155 'audio/wav': 'wav',
3156 'audio/wave': 'wav',
3157 }
3158
3159 ext = FULL_MAP.get(mt)
3160 if ext is not None:
3161 return ext
3162
3163 SUBTYPE_MAP = {
3164 '3gpp': '3gp',
3165 'smptett+xml': 'tt',
3166 'ttaf+xml': 'dfxp',
3167 'ttml+xml': 'ttml',
3168 'x-flv': 'flv',
3169 'x-mp4-fragmented': 'mp4',
3170 'x-ms-sami': 'sami',
3171 'x-ms-wmv': 'wmv',
3172 'mpegurl': 'm3u8',
3173 'x-mpegurl': 'm3u8',
3174 'vnd.apple.mpegurl': 'm3u8',
3175 'dash+xml': 'mpd',
3176 'f4m+xml': 'f4m',
3177 'hds+xml': 'f4m',
3178 'vnd.ms-sstr+xml': 'ism',
3179 'quicktime': 'mov',
3180 'mp2t': 'ts',
3181 'x-wav': 'wav',
3182 'filmstrip+json': 'fs',
3183 'svg+xml': 'svg',
3184 }
3185
3186 _, _, subtype = mt.rpartition('/')
3187 ext = SUBTYPE_MAP.get(subtype.lower())
3188 if ext is not None:
3189 return ext
3190
3191 SUFFIX_MAP = {
3192 'json': 'json',
3193 'xml': 'xml',
3194 'zip': 'zip',
3195 'gzip': 'gz',
3196 }
3197
3198 _, _, suffix = subtype.partition('+')
3199 ext = SUFFIX_MAP.get(suffix)
3200 if ext is not None:
3201 return ext
3202
3203 return subtype.replace('+', '.')
3204
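# Example usage (illustrative values):
#   >>> mimetype2ext('audio/mp4; codecs="mp4a.40.2"')
#   'm4a'
#   >>> mimetype2ext('application/vnd.apple.mpegurl')
#   'm3u8'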
3205
3206 def ext2mimetype(ext_or_url):
3207 if not ext_or_url:
3208 return None
3209 if '.' not in ext_or_url:
3210 ext_or_url = f'file.{ext_or_url}'
3211 return mimetypes.guess_type(ext_or_url)[0]
3212
3213
3214 def parse_codecs(codecs_str):
3215 # http://tools.ietf.org/html/rfc6381
3216 if not codecs_str:
3217 return {}
3218 split_codecs = list(filter(None, map(
3219 str.strip, codecs_str.strip().strip(',').split(','))))
3220 vcodec, acodec, scodec, hdr = None, None, None, None
3221 for full_codec in split_codecs:
3222 parts = full_codec.split('.')
3223 codec = parts[0].replace('0', '')
3224 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3225 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3226 if not vcodec:
3227 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3228 if codec in ('dvh1', 'dvhe'):
3229 hdr = 'DV'
3230 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3231 hdr = 'HDR10'
3232 elif full_codec.replace('0', '').startswith('vp9.2'):
3233 hdr = 'HDR10'
3234 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3235 if not acodec:
3236 acodec = full_codec
3237 elif codec in ('stpp', 'wvtt',):
3238 if not scodec:
3239 scodec = full_codec
3240 else:
3241 write_string(f'WARNING: Unknown codec {full_codec}\n')
3242 if vcodec or acodec or scodec:
3243 return {
3244 'vcodec': vcodec or 'none',
3245 'acodec': acodec or 'none',
3246 'dynamic_range': hdr,
3247 **({'scodec': scodec} if scodec is not None else {}),
3248 }
3249 elif len(split_codecs) == 2:
3250 return {
3251 'vcodec': split_codecs[0],
3252 'acodec': split_codecs[1],
3253 }
3254 return {}
3255
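# Example usage (illustrative values):
#   >>> parse_codecs('avc1.64001F, mp4a.40.2')
#   {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', 'dynamic_range': None}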
3256
3257 def urlhandle_detect_ext(url_handle):
3258 getheader = url_handle.headers.get
3259
3260 cd = getheader('Content-Disposition')
3261 if cd:
3262 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3263 if m:
3264 e = determine_ext(m.group('filename'), default_ext=None)
3265 if e:
3266 return e
3267
3268 return mimetype2ext(getheader('Content-Type'))
3269
3270
3271 def encode_data_uri(data, mime_type):
3272 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3273
3274
3275 def age_restricted(content_limit, age_limit):
3276 """ Returns True iff the content should be blocked """
3277
3278 if age_limit is None: # No limit set
3279 return False
3280 if content_limit is None:
3281 return False # Content available for everyone
3282 return age_limit < content_limit
3283
3284
3285 def is_html(first_bytes):
3286 """ Detect whether a file contains HTML by examining its first bytes. """
3287
3288 BOMS = [
3289 (b'\xef\xbb\xbf', 'utf-8'),
3290 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3291 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3292 (b'\xff\xfe', 'utf-16-le'),
3293 (b'\xfe\xff', 'utf-16-be'),
3294 ]
3295
3296 encoding = 'utf-8'
3297 for bom, enc in BOMS:
3298 while first_bytes.startswith(bom):
3299 encoding, first_bytes = enc, first_bytes[len(bom):]
3300
3301 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3302
3303
3304 def determine_protocol(info_dict):
3305 protocol = info_dict.get('protocol')
3306 if protocol is not None:
3307 return protocol
3308
3309 url = sanitize_url(info_dict['url'])
3310 if url.startswith('rtmp'):
3311 return 'rtmp'
3312 elif url.startswith('mms'):
3313 return 'mms'
3314 elif url.startswith('rtsp'):
3315 return 'rtsp'
3316
3317 ext = determine_ext(url)
3318 if ext == 'm3u8':
3319 return 'm3u8'
3320 elif ext == 'f4m':
3321 return 'f4m'
3322
3323 return compat_urllib_parse_urlparse(url).scheme
3324
3325
3326 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3327 """ Render a list of rows, each as a list of values.
3328 Text after a \t will be right aligned """
3329 def width(string):
3330 return len(remove_terminal_sequences(string).replace('\t', ''))
3331
3332 def get_max_lens(table):
3333 return [max(width(str(v)) for v in col) for col in zip(*table)]
3334
3335 def filter_using_list(row, filterArray):
3336 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3337
3338 max_lens = get_max_lens(data) if hide_empty else []
3339 header_row = filter_using_list(header_row, max_lens)
3340 data = [filter_using_list(row, max_lens) for row in data]
3341
3342 table = [header_row] + data
3343 max_lens = get_max_lens(table)
3344 extra_gap += 1
3345 if delim:
3346 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3347 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3348 for row in table:
3349 for pos, text in enumerate(map(str, row)):
3350 if '\t' in text:
3351 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3352 else:
3353 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3354 ret = '\n'.join(''.join(row).rstrip() for row in table)
3355 return ret
3356
3357
3358 def _match_one(filter_part, dct, incomplete):
3359 # TODO: Generalize code with YoutubeDL._build_format_filter
3360 STRING_OPERATORS = {
3361 '*=': operator.contains,
3362 '^=': lambda attr, value: attr.startswith(value),
3363 '$=': lambda attr, value: attr.endswith(value),
3364 '~=': lambda attr, value: re.search(value, attr),
3365 }
3366 COMPARISON_OPERATORS = {
3367 **STRING_OPERATORS,
3368 '<=': operator.le, # "<=" must be defined above "<"
3369 '<': operator.lt,
3370 '>=': operator.ge,
3371 '>': operator.gt,
3372 '=': operator.eq,
3373 }
3374
3375 if isinstance(incomplete, bool):
3376 is_incomplete = lambda _: incomplete
3377 else:
3378 is_incomplete = lambda k: k in incomplete
3379
3380 operator_rex = re.compile(r'''(?x)\s*
3381 (?P<key>[a-z_]+)
3382 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3383 (?:
3384 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3385 (?P<strval>.+?)
3386 )
3387 \s*$
3388 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3389 m = operator_rex.search(filter_part)
3390 if m:
3391 m = m.groupdict()
3392 unnegated_op = COMPARISON_OPERATORS[m['op']]
3393 if m['negation']:
3394 op = lambda attr, value: not unnegated_op(attr, value)
3395 else:
3396 op = unnegated_op
3397 comparison_value = m['quotedstrval'] or m['strval']
3398 if m['quote']:
3399 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3400 actual_value = dct.get(m['key'])
3401 numeric_comparison = None
3402 if isinstance(actual_value, (int, float)):
3403 # If the original field is a string and matching comparisonvalue is
3404 # a number we should respect the origin of the original field
3405 # and process comparison value as a string (see
3406 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3407 try:
3408 numeric_comparison = int(comparison_value)
3409 except ValueError:
3410 numeric_comparison = parse_filesize(comparison_value)
3411 if numeric_comparison is None:
3412 numeric_comparison = parse_filesize(f'{comparison_value}B')
3413 if numeric_comparison is None:
3414 numeric_comparison = parse_duration(comparison_value)
3415 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3416 raise ValueError('Operator %s only supports string values!' % m['op'])
3417 if actual_value is None:
3418 return is_incomplete(m['key']) or m['none_inclusive']
3419 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3420
3421 UNARY_OPERATORS = {
3422 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3423 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3424 }
3425 operator_rex = re.compile(r'''(?x)\s*
3426 (?P<op>%s)\s*(?P<key>[a-z_]+)
3427 \s*$
3428 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3429 m = operator_rex.search(filter_part)
3430 if m:
3431 op = UNARY_OPERATORS[m.group('op')]
3432 actual_value = dct.get(m.group('key'))
3433 if is_incomplete(m.group('key')) and actual_value is None:
3434 return True
3435 return op(actual_value)
3436
3437 raise ValueError('Invalid filter part %r' % filter_part)
3438
3439
3440 def match_str(filter_str, dct, incomplete=False):
3441 """ Filter a dictionary with a simple string syntax.
3442 @returns Whether the filter passes
3443 @param incomplete Set of keys that is expected to be missing from dct.
3444 Can be True/False to indicate all/none of the keys may be missing.
3445 All conditions on incomplete keys pass if the key is missing
3446 """
3447 return all(
3448 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3449 for filter_part in re.split(r'(?<!\\)&', filter_str))
3450
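# Example usage (illustrative values):
#   >>> match_str('like_count > 100 & duration < 600',
#   ...           {'like_count': 190, 'duration': 300})
#   True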
3451
3452 def match_filter_func(filters):
3453 if not filters:
3454 return None
3455 filters = set(variadic(filters))
3456
3457 interactive = '-' in filters
3458 if interactive:
3459 filters.remove('-')
3460
3461 def _match_func(info_dict, incomplete=False):
3462 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3463 return NO_DEFAULT if interactive and not incomplete else None
3464 else:
3465 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3466 filter_str = ') | ('.join(map(str.strip, filters))
3467 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3468 return _match_func
3469
3470
3471 def parse_dfxp_time_expr(time_expr):
3472 if not time_expr:
3473 return
3474
3475 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3476 if mobj:
3477 return float(mobj.group('time_offset'))
3478
3479 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3480 if mobj:
3481 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3482
3483
3484 def srt_subtitles_timecode(seconds):
3485 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3486
3487
3488 def ass_subtitles_timecode(seconds):
3489 time = timetuple_from_msec(seconds * 1000)
3490 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3491
3492
3493 def dfxp2srt(dfxp_data):
3494 '''
3495 @param dfxp_data A bytes-like object containing DFXP data
3496 @returns A unicode object containing converted SRT data
3497 '''
3498 LEGACY_NAMESPACES = (
3499 (b'http://www.w3.org/ns/ttml', [
3500 b'http://www.w3.org/2004/11/ttaf1',
3501 b'http://www.w3.org/2006/04/ttaf1',
3502 b'http://www.w3.org/2006/10/ttaf1',
3503 ]),
3504 (b'http://www.w3.org/ns/ttml#styling', [
3505 b'http://www.w3.org/ns/ttml#style',
3506 ]),
3507 )
3508
3509 SUPPORTED_STYLING = [
3510 'color',
3511 'fontFamily',
3512 'fontSize',
3513 'fontStyle',
3514 'fontWeight',
3515 'textDecoration'
3516 ]
3517
3518 _x = functools.partial(xpath_with_ns, ns_map={
3519 'xml': 'http://www.w3.org/XML/1998/namespace',
3520 'ttml': 'http://www.w3.org/ns/ttml',
3521 'tts': 'http://www.w3.org/ns/ttml#styling',
3522 })
3523
3524 styles = {}
3525 default_style = {}
3526
3527 class TTMLPElementParser:
3528 _out = ''
3529 _unclosed_elements = []
3530 _applied_styles = []
3531
3532 def start(self, tag, attrib):
3533 if tag in (_x('ttml:br'), 'br'):
3534 self._out += '\n'
3535 else:
3536 unclosed_elements = []
3537 style = {}
3538 element_style_id = attrib.get('style')
3539 if default_style:
3540 style.update(default_style)
3541 if element_style_id:
3542 style.update(styles.get(element_style_id, {}))
3543 for prop in SUPPORTED_STYLING:
3544 prop_val = attrib.get(_x('tts:' + prop))
3545 if prop_val:
3546 style[prop] = prop_val
3547 if style:
3548 font = ''
3549 for k, v in sorted(style.items()):
3550 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3551 continue
3552 if k == 'color':
3553 font += ' color="%s"' % v
3554 elif k == 'fontSize':
3555 font += ' size="%s"' % v
3556 elif k == 'fontFamily':
3557 font += ' face="%s"' % v
3558 elif k == 'fontWeight' and v == 'bold':
3559 self._out += '<b>'
3560 unclosed_elements.append('b')
3561 elif k == 'fontStyle' and v == 'italic':
3562 self._out += '<i>'
3563 unclosed_elements.append('i')
3564 elif k == 'textDecoration' and v == 'underline':
3565 self._out += '<u>'
3566 unclosed_elements.append('u')
3567 if font:
3568 self._out += '<font' + font + '>'
3569 unclosed_elements.append('font')
3570 applied_style = {}
3571 if self._applied_styles:
3572 applied_style.update(self._applied_styles[-1])
3573 applied_style.update(style)
3574 self._applied_styles.append(applied_style)
3575 self._unclosed_elements.append(unclosed_elements)
3576
3577 def end(self, tag):
3578 if tag not in (_x('ttml:br'), 'br'):
3579 unclosed_elements = self._unclosed_elements.pop()
3580 for element in reversed(unclosed_elements):
3581 self._out += '</%s>' % element
3582 if unclosed_elements and self._applied_styles:
3583 self._applied_styles.pop()
3584
3585 def data(self, data):
3586 self._out += data
3587
3588 def close(self):
3589 return self._out.strip()
3590
3591 def parse_node(node):
3592 target = TTMLPElementParser()
3593 parser = xml.etree.ElementTree.XMLParser(target=target)
3594 parser.feed(xml.etree.ElementTree.tostring(node))
3595 return parser.close()
3596
3597 for k, v in LEGACY_NAMESPACES:
3598 for ns in v:
3599 dfxp_data = dfxp_data.replace(ns, k)
3600
3601 dfxp = compat_etree_fromstring(dfxp_data)
3602 out = []
3603 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3604
3605 if not paras:
3606 raise ValueError('Invalid dfxp/TTML subtitle')
3607
3608 repeat = False
3609 while True:
3610 for style in dfxp.findall(_x('.//ttml:style')):
3611 style_id = style.get('id') or style.get(_x('xml:id'))
3612 if not style_id:
3613 continue
3614 parent_style_id = style.get('style')
3615 if parent_style_id:
3616 if parent_style_id not in styles:
3617 repeat = True
3618 continue
3619 styles[style_id] = styles[parent_style_id].copy()
3620 for prop in SUPPORTED_STYLING:
3621 prop_val = style.get(_x('tts:' + prop))
3622 if prop_val:
3623 styles.setdefault(style_id, {})[prop] = prop_val
3624 if repeat:
3625 repeat = False
3626 else:
3627 break
3628
3629 for p in ('body', 'div'):
3630 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3631 if ele is None:
3632 continue
3633 style = styles.get(ele.get('style'))
3634 if not style:
3635 continue
3636 default_style.update(style)
3637
3638 for para, index in zip(paras, itertools.count(1)):
3639 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3640 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3641 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3642 if begin_time is None:
3643 continue
3644 if not end_time:
3645 if not dur:
3646 continue
3647 end_time = begin_time + dur
3648 out.append('%d\n%s --> %s\n%s\n\n' % (
3649 index,
3650 srt_subtitles_timecode(begin_time),
3651 srt_subtitles_timecode(end_time),
3652 parse_node(para)))
3653
3654 return ''.join(out)
3655
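# A quick usage sketch; the minimal TTML document below is illustrative:
#
#   >>> dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#   ...          b'<p begin="0s" end="1s">Hello</p></div></body></tt>')
#   '1\n00:00:00,000 --> 00:00:01,000\nHello\n\n'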
3656
3657 def cli_option(params, command_option, param):
3658 param = params.get(param)
3659 if param:
3660 param = compat_str(param)
3661 return [command_option, param] if param is not None else []
3662
3663
3664 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3665 param = params.get(param)
3666 if param is None:
3667 return []
3668 assert isinstance(param, bool)
3669 if separator:
3670 return [command_option + separator + (true_value if param else false_value)]
3671 return [command_option, true_value if param else false_value]
3672
3673
3674 def cli_valueless_option(params, command_option, param, expected_value=True):
3675 param = params.get(param)
3676 return [command_option] if param == expected_value else []
3677
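# A quick usage sketch of the cli_* helpers; the option/param names below are illustrative:
#
#   >>> cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', '127.0.0.1:3128']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   ['--no-check-certificate', 'true']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '=')
#   ['--check-certificate=false']
#   >>> cli_valueless_option({'nopart': True}, '--no-part', 'nopart')
#   ['--no-part']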
3678
3679 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3680 if isinstance(argdict, (list, tuple)): # for backward compatibility
3681 if use_compat:
3682 return argdict
3683 else:
3684 argdict = None
3685 if argdict is None:
3686 return default
3687 assert isinstance(argdict, dict)
3688
3689 assert isinstance(keys, (list, tuple))
3690 for key_list in keys:
3691 arg_list = list(filter(
3692 lambda x: x is not None,
3693 [argdict.get(key.lower()) for key in variadic(key_list)]))
3694 if arg_list:
3695 return [arg for args in arg_list for arg in args]
3696 return default
3697
3698
3699 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3700 main_key, exe = main_key.lower(), exe.lower()
3701 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3702 keys = [f'{root_key}{k}' for k in (keys or [''])]
3703 if root_key in keys:
3704 if main_key != exe:
3705 keys.append((main_key, exe))
3706 keys.append('default')
3707 else:
3708 use_compat = False
3709 return cli_configuration_args(argdict, keys, default, use_compat)
3710
3711
3712 class ISO639Utils:
3713 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3714 _lang_map = {
3715 'aa': 'aar',
3716 'ab': 'abk',
3717 'ae': 'ave',
3718 'af': 'afr',
3719 'ak': 'aka',
3720 'am': 'amh',
3721 'an': 'arg',
3722 'ar': 'ara',
3723 'as': 'asm',
3724 'av': 'ava',
3725 'ay': 'aym',
3726 'az': 'aze',
3727 'ba': 'bak',
3728 'be': 'bel',
3729 'bg': 'bul',
3730 'bh': 'bih',
3731 'bi': 'bis',
3732 'bm': 'bam',
3733 'bn': 'ben',
3734 'bo': 'bod',
3735 'br': 'bre',
3736 'bs': 'bos',
3737 'ca': 'cat',
3738 'ce': 'che',
3739 'ch': 'cha',
3740 'co': 'cos',
3741 'cr': 'cre',
3742 'cs': 'ces',
3743 'cu': 'chu',
3744 'cv': 'chv',
3745 'cy': 'cym',
3746 'da': 'dan',
3747 'de': 'deu',
3748 'dv': 'div',
3749 'dz': 'dzo',
3750 'ee': 'ewe',
3751 'el': 'ell',
3752 'en': 'eng',
3753 'eo': 'epo',
3754 'es': 'spa',
3755 'et': 'est',
3756 'eu': 'eus',
3757 'fa': 'fas',
3758 'ff': 'ful',
3759 'fi': 'fin',
3760 'fj': 'fij',
3761 'fo': 'fao',
3762 'fr': 'fra',
3763 'fy': 'fry',
3764 'ga': 'gle',
3765 'gd': 'gla',
3766 'gl': 'glg',
3767 'gn': 'grn',
3768 'gu': 'guj',
3769 'gv': 'glv',
3770 'ha': 'hau',
3771 'he': 'heb',
3772 'iw': 'heb', # Replaced by he in 1989 revision
3773 'hi': 'hin',
3774 'ho': 'hmo',
3775 'hr': 'hrv',
3776 'ht': 'hat',
3777 'hu': 'hun',
3778 'hy': 'hye',
3779 'hz': 'her',
3780 'ia': 'ina',
3781 'id': 'ind',
3782 'in': 'ind', # Replaced by id in 1989 revision
3783 'ie': 'ile',
3784 'ig': 'ibo',
3785 'ii': 'iii',
3786 'ik': 'ipk',
3787 'io': 'ido',
3788 'is': 'isl',
3789 'it': 'ita',
3790 'iu': 'iku',
3791 'ja': 'jpn',
3792 'jv': 'jav',
3793 'ka': 'kat',
3794 'kg': 'kon',
3795 'ki': 'kik',
3796 'kj': 'kua',
3797 'kk': 'kaz',
3798 'kl': 'kal',
3799 'km': 'khm',
3800 'kn': 'kan',
3801 'ko': 'kor',
3802 'kr': 'kau',
3803 'ks': 'kas',
3804 'ku': 'kur',
3805 'kv': 'kom',
3806 'kw': 'cor',
3807 'ky': 'kir',
3808 'la': 'lat',
3809 'lb': 'ltz',
3810 'lg': 'lug',
3811 'li': 'lim',
3812 'ln': 'lin',
3813 'lo': 'lao',
3814 'lt': 'lit',
3815 'lu': 'lub',
3816 'lv': 'lav',
3817 'mg': 'mlg',
3818 'mh': 'mah',
3819 'mi': 'mri',
3820 'mk': 'mkd',
3821 'ml': 'mal',
3822 'mn': 'mon',
3823 'mr': 'mar',
3824 'ms': 'msa',
3825 'mt': 'mlt',
3826 'my': 'mya',
3827 'na': 'nau',
3828 'nb': 'nob',
3829 'nd': 'nde',
3830 'ne': 'nep',
3831 'ng': 'ndo',
3832 'nl': 'nld',
3833 'nn': 'nno',
3834 'no': 'nor',
3835 'nr': 'nbl',
3836 'nv': 'nav',
3837 'ny': 'nya',
3838 'oc': 'oci',
3839 'oj': 'oji',
3840 'om': 'orm',
3841 'or': 'ori',
3842 'os': 'oss',
3843 'pa': 'pan',
3844 'pi': 'pli',
3845 'pl': 'pol',
3846 'ps': 'pus',
3847 'pt': 'por',
3848 'qu': 'que',
3849 'rm': 'roh',
3850 'rn': 'run',
3851 'ro': 'ron',
3852 'ru': 'rus',
3853 'rw': 'kin',
3854 'sa': 'san',
3855 'sc': 'srd',
3856 'sd': 'snd',
3857 'se': 'sme',
3858 'sg': 'sag',
3859 'si': 'sin',
3860 'sk': 'slk',
3861 'sl': 'slv',
3862 'sm': 'smo',
3863 'sn': 'sna',
3864 'so': 'som',
3865 'sq': 'sqi',
3866 'sr': 'srp',
3867 'ss': 'ssw',
3868 'st': 'sot',
3869 'su': 'sun',
3870 'sv': 'swe',
3871 'sw': 'swa',
3872 'ta': 'tam',
3873 'te': 'tel',
3874 'tg': 'tgk',
3875 'th': 'tha',
3876 'ti': 'tir',
3877 'tk': 'tuk',
3878 'tl': 'tgl',
3879 'tn': 'tsn',
3880 'to': 'ton',
3881 'tr': 'tur',
3882 'ts': 'tso',
3883 'tt': 'tat',
3884 'tw': 'twi',
3885 'ty': 'tah',
3886 'ug': 'uig',
3887 'uk': 'ukr',
3888 'ur': 'urd',
3889 'uz': 'uzb',
3890 've': 'ven',
3891 'vi': 'vie',
3892 'vo': 'vol',
3893 'wa': 'wln',
3894 'wo': 'wol',
3895 'xh': 'xho',
3896 'yi': 'yid',
3897 'ji': 'yid', # Replaced by yi in 1989 revision
3898 'yo': 'yor',
3899 'za': 'zha',
3900 'zh': 'zho',
3901 'zu': 'zul',
3902 }
3903
3904 @classmethod
3905 def short2long(cls, code):
3906 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3907 return cls._lang_map.get(code[:2])
3908
3909 @classmethod
3910 def long2short(cls, code):
3911 """Convert language code from ISO 639-2/T to ISO 639-1"""
3912 for short_name, long_name in cls._lang_map.items():
3913 if long_name == code:
3914 return short_name
3915
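# A quick usage sketch:
#
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.short2long('en-US')  # only the first two characters are used
#   'eng'
#   >>> ISO639Utils.long2short('fra')
#   'fr'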
3916
3917 class ISO3166Utils:
3918 # From http://data.okfn.org/data/core/country-list
3919 _country_map = {
3920 'AF': 'Afghanistan',
3921 'AX': 'Åland Islands',
3922 'AL': 'Albania',
3923 'DZ': 'Algeria',
3924 'AS': 'American Samoa',
3925 'AD': 'Andorra',
3926 'AO': 'Angola',
3927 'AI': 'Anguilla',
3928 'AQ': 'Antarctica',
3929 'AG': 'Antigua and Barbuda',
3930 'AR': 'Argentina',
3931 'AM': 'Armenia',
3932 'AW': 'Aruba',
3933 'AU': 'Australia',
3934 'AT': 'Austria',
3935 'AZ': 'Azerbaijan',
3936 'BS': 'Bahamas',
3937 'BH': 'Bahrain',
3938 'BD': 'Bangladesh',
3939 'BB': 'Barbados',
3940 'BY': 'Belarus',
3941 'BE': 'Belgium',
3942 'BZ': 'Belize',
3943 'BJ': 'Benin',
3944 'BM': 'Bermuda',
3945 'BT': 'Bhutan',
3946 'BO': 'Bolivia, Plurinational State of',
3947 'BQ': 'Bonaire, Sint Eustatius and Saba',
3948 'BA': 'Bosnia and Herzegovina',
3949 'BW': 'Botswana',
3950 'BV': 'Bouvet Island',
3951 'BR': 'Brazil',
3952 'IO': 'British Indian Ocean Territory',
3953 'BN': 'Brunei Darussalam',
3954 'BG': 'Bulgaria',
3955 'BF': 'Burkina Faso',
3956 'BI': 'Burundi',
3957 'KH': 'Cambodia',
3958 'CM': 'Cameroon',
3959 'CA': 'Canada',
3960 'CV': 'Cape Verde',
3961 'KY': 'Cayman Islands',
3962 'CF': 'Central African Republic',
3963 'TD': 'Chad',
3964 'CL': 'Chile',
3965 'CN': 'China',
3966 'CX': 'Christmas Island',
3967 'CC': 'Cocos (Keeling) Islands',
3968 'CO': 'Colombia',
3969 'KM': 'Comoros',
3970 'CG': 'Congo',
3971 'CD': 'Congo, the Democratic Republic of the',
3972 'CK': 'Cook Islands',
3973 'CR': 'Costa Rica',
3974 'CI': 'Côte d\'Ivoire',
3975 'HR': 'Croatia',
3976 'CU': 'Cuba',
3977 'CW': 'Curaçao',
3978 'CY': 'Cyprus',
3979 'CZ': 'Czech Republic',
3980 'DK': 'Denmark',
3981 'DJ': 'Djibouti',
3982 'DM': 'Dominica',
3983 'DO': 'Dominican Republic',
3984 'EC': 'Ecuador',
3985 'EG': 'Egypt',
3986 'SV': 'El Salvador',
3987 'GQ': 'Equatorial Guinea',
3988 'ER': 'Eritrea',
3989 'EE': 'Estonia',
3990 'ET': 'Ethiopia',
3991 'FK': 'Falkland Islands (Malvinas)',
3992 'FO': 'Faroe Islands',
3993 'FJ': 'Fiji',
3994 'FI': 'Finland',
3995 'FR': 'France',
3996 'GF': 'French Guiana',
3997 'PF': 'French Polynesia',
3998 'TF': 'French Southern Territories',
3999 'GA': 'Gabon',
4000 'GM': 'Gambia',
4001 'GE': 'Georgia',
4002 'DE': 'Germany',
4003 'GH': 'Ghana',
4004 'GI': 'Gibraltar',
4005 'GR': 'Greece',
4006 'GL': 'Greenland',
4007 'GD': 'Grenada',
4008 'GP': 'Guadeloupe',
4009 'GU': 'Guam',
4010 'GT': 'Guatemala',
4011 'GG': 'Guernsey',
4012 'GN': 'Guinea',
4013 'GW': 'Guinea-Bissau',
4014 'GY': 'Guyana',
4015 'HT': 'Haiti',
4016 'HM': 'Heard Island and McDonald Islands',
4017 'VA': 'Holy See (Vatican City State)',
4018 'HN': 'Honduras',
4019 'HK': 'Hong Kong',
4020 'HU': 'Hungary',
4021 'IS': 'Iceland',
4022 'IN': 'India',
4023 'ID': 'Indonesia',
4024 'IR': 'Iran, Islamic Republic of',
4025 'IQ': 'Iraq',
4026 'IE': 'Ireland',
4027 'IM': 'Isle of Man',
4028 'IL': 'Israel',
4029 'IT': 'Italy',
4030 'JM': 'Jamaica',
4031 'JP': 'Japan',
4032 'JE': 'Jersey',
4033 'JO': 'Jordan',
4034 'KZ': 'Kazakhstan',
4035 'KE': 'Kenya',
4036 'KI': 'Kiribati',
4037 'KP': 'Korea, Democratic People\'s Republic of',
4038 'KR': 'Korea, Republic of',
4039 'KW': 'Kuwait',
4040 'KG': 'Kyrgyzstan',
4041 'LA': 'Lao People\'s Democratic Republic',
4042 'LV': 'Latvia',
4043 'LB': 'Lebanon',
4044 'LS': 'Lesotho',
4045 'LR': 'Liberia',
4046 'LY': 'Libya',
4047 'LI': 'Liechtenstein',
4048 'LT': 'Lithuania',
4049 'LU': 'Luxembourg',
4050 'MO': 'Macao',
4051 'MK': 'Macedonia, the Former Yugoslav Republic of',
4052 'MG': 'Madagascar',
4053 'MW': 'Malawi',
4054 'MY': 'Malaysia',
4055 'MV': 'Maldives',
4056 'ML': 'Mali',
4057 'MT': 'Malta',
4058 'MH': 'Marshall Islands',
4059 'MQ': 'Martinique',
4060 'MR': 'Mauritania',
4061 'MU': 'Mauritius',
4062 'YT': 'Mayotte',
4063 'MX': 'Mexico',
4064 'FM': 'Micronesia, Federated States of',
4065 'MD': 'Moldova, Republic of',
4066 'MC': 'Monaco',
4067 'MN': 'Mongolia',
4068 'ME': 'Montenegro',
4069 'MS': 'Montserrat',
4070 'MA': 'Morocco',
4071 'MZ': 'Mozambique',
4072 'MM': 'Myanmar',
4073 'NA': 'Namibia',
4074 'NR': 'Nauru',
4075 'NP': 'Nepal',
4076 'NL': 'Netherlands',
4077 'NC': 'New Caledonia',
4078 'NZ': 'New Zealand',
4079 'NI': 'Nicaragua',
4080 'NE': 'Niger',
4081 'NG': 'Nigeria',
4082 'NU': 'Niue',
4083 'NF': 'Norfolk Island',
4084 'MP': 'Northern Mariana Islands',
4085 'NO': 'Norway',
4086 'OM': 'Oman',
4087 'PK': 'Pakistan',
4088 'PW': 'Palau',
4089 'PS': 'Palestine, State of',
4090 'PA': 'Panama',
4091 'PG': 'Papua New Guinea',
4092 'PY': 'Paraguay',
4093 'PE': 'Peru',
4094 'PH': 'Philippines',
4095 'PN': 'Pitcairn',
4096 'PL': 'Poland',
4097 'PT': 'Portugal',
4098 'PR': 'Puerto Rico',
4099 'QA': 'Qatar',
4100 'RE': 'Réunion',
4101 'RO': 'Romania',
4102 'RU': 'Russian Federation',
4103 'RW': 'Rwanda',
4104 'BL': 'Saint Barthélemy',
4105 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4106 'KN': 'Saint Kitts and Nevis',
4107 'LC': 'Saint Lucia',
4108 'MF': 'Saint Martin (French part)',
4109 'PM': 'Saint Pierre and Miquelon',
4110 'VC': 'Saint Vincent and the Grenadines',
4111 'WS': 'Samoa',
4112 'SM': 'San Marino',
4113 'ST': 'Sao Tome and Principe',
4114 'SA': 'Saudi Arabia',
4115 'SN': 'Senegal',
4116 'RS': 'Serbia',
4117 'SC': 'Seychelles',
4118 'SL': 'Sierra Leone',
4119 'SG': 'Singapore',
4120 'SX': 'Sint Maarten (Dutch part)',
4121 'SK': 'Slovakia',
4122 'SI': 'Slovenia',
4123 'SB': 'Solomon Islands',
4124 'SO': 'Somalia',
4125 'ZA': 'South Africa',
4126 'GS': 'South Georgia and the South Sandwich Islands',
4127 'SS': 'South Sudan',
4128 'ES': 'Spain',
4129 'LK': 'Sri Lanka',
4130 'SD': 'Sudan',
4131 'SR': 'Suriname',
4132 'SJ': 'Svalbard and Jan Mayen',
4133 'SZ': 'Swaziland',
4134 'SE': 'Sweden',
4135 'CH': 'Switzerland',
4136 'SY': 'Syrian Arab Republic',
4137 'TW': 'Taiwan, Province of China',
4138 'TJ': 'Tajikistan',
4139 'TZ': 'Tanzania, United Republic of',
4140 'TH': 'Thailand',
4141 'TL': 'Timor-Leste',
4142 'TG': 'Togo',
4143 'TK': 'Tokelau',
4144 'TO': 'Tonga',
4145 'TT': 'Trinidad and Tobago',
4146 'TN': 'Tunisia',
4147 'TR': 'Turkey',
4148 'TM': 'Turkmenistan',
4149 'TC': 'Turks and Caicos Islands',
4150 'TV': 'Tuvalu',
4151 'UG': 'Uganda',
4152 'UA': 'Ukraine',
4153 'AE': 'United Arab Emirates',
4154 'GB': 'United Kingdom',
4155 'US': 'United States',
4156 'UM': 'United States Minor Outlying Islands',
4157 'UY': 'Uruguay',
4158 'UZ': 'Uzbekistan',
4159 'VU': 'Vanuatu',
4160 'VE': 'Venezuela, Bolivarian Republic of',
4161 'VN': 'Viet Nam',
4162 'VG': 'Virgin Islands, British',
4163 'VI': 'Virgin Islands, U.S.',
4164 'WF': 'Wallis and Futuna',
4165 'EH': 'Western Sahara',
4166 'YE': 'Yemen',
4167 'ZM': 'Zambia',
4168 'ZW': 'Zimbabwe',
4169 }
4170
4171 @classmethod
4172 def short2full(cls, code):
4173 """Convert an ISO 3166-2 country code to the corresponding full name"""
4174 return cls._country_map.get(code.upper())
4175
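# A quick usage sketch:
#
#   >>> ISO3166Utils.short2full('de')
#   'Germany'
#   >>> ISO3166Utils.short2full('xx')  # unknown codes yield None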
4176
4177 class GeoUtils:
4178 # Major IPv4 address blocks per country
4179 _country_ip_map = {
4180 'AD': '46.172.224.0/19',
4181 'AE': '94.200.0.0/13',
4182 'AF': '149.54.0.0/17',
4183 'AG': '209.59.64.0/18',
4184 'AI': '204.14.248.0/21',
4185 'AL': '46.99.0.0/16',
4186 'AM': '46.70.0.0/15',
4187 'AO': '105.168.0.0/13',
4188 'AP': '182.50.184.0/21',
4189 'AQ': '23.154.160.0/24',
4190 'AR': '181.0.0.0/12',
4191 'AS': '202.70.112.0/20',
4192 'AT': '77.116.0.0/14',
4193 'AU': '1.128.0.0/11',
4194 'AW': '181.41.0.0/18',
4195 'AX': '185.217.4.0/22',
4196 'AZ': '5.197.0.0/16',
4197 'BA': '31.176.128.0/17',
4198 'BB': '65.48.128.0/17',
4199 'BD': '114.130.0.0/16',
4200 'BE': '57.0.0.0/8',
4201 'BF': '102.178.0.0/15',
4202 'BG': '95.42.0.0/15',
4203 'BH': '37.131.0.0/17',
4204 'BI': '154.117.192.0/18',
4205 'BJ': '137.255.0.0/16',
4206 'BL': '185.212.72.0/23',
4207 'BM': '196.12.64.0/18',
4208 'BN': '156.31.0.0/16',
4209 'BO': '161.56.0.0/16',
4210 'BQ': '161.0.80.0/20',
4211 'BR': '191.128.0.0/12',
4212 'BS': '24.51.64.0/18',
4213 'BT': '119.2.96.0/19',
4214 'BW': '168.167.0.0/16',
4215 'BY': '178.120.0.0/13',
4216 'BZ': '179.42.192.0/18',
4217 'CA': '99.224.0.0/11',
4218 'CD': '41.243.0.0/16',
4219 'CF': '197.242.176.0/21',
4220 'CG': '160.113.0.0/16',
4221 'CH': '85.0.0.0/13',
4222 'CI': '102.136.0.0/14',
4223 'CK': '202.65.32.0/19',
4224 'CL': '152.172.0.0/14',
4225 'CM': '102.244.0.0/14',
4226 'CN': '36.128.0.0/10',
4227 'CO': '181.240.0.0/12',
4228 'CR': '201.192.0.0/12',
4229 'CU': '152.206.0.0/15',
4230 'CV': '165.90.96.0/19',
4231 'CW': '190.88.128.0/17',
4232 'CY': '31.153.0.0/16',
4233 'CZ': '88.100.0.0/14',
4234 'DE': '53.0.0.0/8',
4235 'DJ': '197.241.0.0/17',
4236 'DK': '87.48.0.0/12',
4237 'DM': '192.243.48.0/20',
4238 'DO': '152.166.0.0/15',
4239 'DZ': '41.96.0.0/12',
4240 'EC': '186.68.0.0/15',
4241 'EE': '90.190.0.0/15',
4242 'EG': '156.160.0.0/11',
4243 'ER': '196.200.96.0/20',
4244 'ES': '88.0.0.0/11',
4245 'ET': '196.188.0.0/14',
4246 'EU': '2.16.0.0/13',
4247 'FI': '91.152.0.0/13',
4248 'FJ': '144.120.0.0/16',
4249 'FK': '80.73.208.0/21',
4250 'FM': '119.252.112.0/20',
4251 'FO': '88.85.32.0/19',
4252 'FR': '90.0.0.0/9',
4253 'GA': '41.158.0.0/15',
4254 'GB': '25.0.0.0/8',
4255 'GD': '74.122.88.0/21',
4256 'GE': '31.146.0.0/16',
4257 'GF': '161.22.64.0/18',
4258 'GG': '62.68.160.0/19',
4259 'GH': '154.160.0.0/12',
4260 'GI': '95.164.0.0/16',
4261 'GL': '88.83.0.0/19',
4262 'GM': '160.182.0.0/15',
4263 'GN': '197.149.192.0/18',
4264 'GP': '104.250.0.0/19',
4265 'GQ': '105.235.224.0/20',
4266 'GR': '94.64.0.0/13',
4267 'GT': '168.234.0.0/16',
4268 'GU': '168.123.0.0/16',
4269 'GW': '197.214.80.0/20',
4270 'GY': '181.41.64.0/18',
4271 'HK': '113.252.0.0/14',
4272 'HN': '181.210.0.0/16',
4273 'HR': '93.136.0.0/13',
4274 'HT': '148.102.128.0/17',
4275 'HU': '84.0.0.0/14',
4276 'ID': '39.192.0.0/10',
4277 'IE': '87.32.0.0/12',
4278 'IL': '79.176.0.0/13',
4279 'IM': '5.62.80.0/20',
4280 'IN': '117.192.0.0/10',
4281 'IO': '203.83.48.0/21',
4282 'IQ': '37.236.0.0/14',
4283 'IR': '2.176.0.0/12',
4284 'IS': '82.221.0.0/16',
4285 'IT': '79.0.0.0/10',
4286 'JE': '87.244.64.0/18',
4287 'JM': '72.27.0.0/17',
4288 'JO': '176.29.0.0/16',
4289 'JP': '133.0.0.0/8',
4290 'KE': '105.48.0.0/12',
4291 'KG': '158.181.128.0/17',
4292 'KH': '36.37.128.0/17',
4293 'KI': '103.25.140.0/22',
4294 'KM': '197.255.224.0/20',
4295 'KN': '198.167.192.0/19',
4296 'KP': '175.45.176.0/22',
4297 'KR': '175.192.0.0/10',
4298 'KW': '37.36.0.0/14',
4299 'KY': '64.96.0.0/15',
4300 'KZ': '2.72.0.0/13',
4301 'LA': '115.84.64.0/18',
4302 'LB': '178.135.0.0/16',
4303 'LC': '24.92.144.0/20',
4304 'LI': '82.117.0.0/19',
4305 'LK': '112.134.0.0/15',
4306 'LR': '102.183.0.0/16',
4307 'LS': '129.232.0.0/17',
4308 'LT': '78.56.0.0/13',
4309 'LU': '188.42.0.0/16',
4310 'LV': '46.109.0.0/16',
4311 'LY': '41.252.0.0/14',
4312 'MA': '105.128.0.0/11',
4313 'MC': '88.209.64.0/18',
4314 'MD': '37.246.0.0/16',
4315 'ME': '178.175.0.0/17',
4316 'MF': '74.112.232.0/21',
4317 'MG': '154.126.0.0/17',
4318 'MH': '117.103.88.0/21',
4319 'MK': '77.28.0.0/15',
4320 'ML': '154.118.128.0/18',
4321 'MM': '37.111.0.0/17',
4322 'MN': '49.0.128.0/17',
4323 'MO': '60.246.0.0/16',
4324 'MP': '202.88.64.0/20',
4325 'MQ': '109.203.224.0/19',
4326 'MR': '41.188.64.0/18',
4327 'MS': '208.90.112.0/22',
4328 'MT': '46.11.0.0/16',
4329 'MU': '105.16.0.0/12',
4330 'MV': '27.114.128.0/18',
4331 'MW': '102.70.0.0/15',
4332 'MX': '187.192.0.0/11',
4333 'MY': '175.136.0.0/13',
4334 'MZ': '197.218.0.0/15',
4335 'NA': '41.182.0.0/16',
4336 'NC': '101.101.0.0/18',
4337 'NE': '197.214.0.0/18',
4338 'NF': '203.17.240.0/22',
4339 'NG': '105.112.0.0/12',
4340 'NI': '186.76.0.0/15',
4341 'NL': '145.96.0.0/11',
4342 'NO': '84.208.0.0/13',
4343 'NP': '36.252.0.0/15',
4344 'NR': '203.98.224.0/19',
4345 'NU': '49.156.48.0/22',
4346 'NZ': '49.224.0.0/14',
4347 'OM': '5.36.0.0/15',
4348 'PA': '186.72.0.0/15',
4349 'PE': '186.160.0.0/14',
4350 'PF': '123.50.64.0/18',
4351 'PG': '124.240.192.0/19',
4352 'PH': '49.144.0.0/13',
4353 'PK': '39.32.0.0/11',
4354 'PL': '83.0.0.0/11',
4355 'PM': '70.36.0.0/20',
4356 'PR': '66.50.0.0/16',
4357 'PS': '188.161.0.0/16',
4358 'PT': '85.240.0.0/13',
4359 'PW': '202.124.224.0/20',
4360 'PY': '181.120.0.0/14',
4361 'QA': '37.210.0.0/15',
4362 'RE': '102.35.0.0/16',
4363 'RO': '79.112.0.0/13',
4364 'RS': '93.86.0.0/15',
4365 'RU': '5.136.0.0/13',
4366 'RW': '41.186.0.0/16',
4367 'SA': '188.48.0.0/13',
4368 'SB': '202.1.160.0/19',
4369 'SC': '154.192.0.0/11',
4370 'SD': '102.120.0.0/13',
4371 'SE': '78.64.0.0/12',
4372 'SG': '8.128.0.0/10',
4373 'SI': '188.196.0.0/14',
4374 'SK': '78.98.0.0/15',
4375 'SL': '102.143.0.0/17',
4376 'SM': '89.186.32.0/19',
4377 'SN': '41.82.0.0/15',
4378 'SO': '154.115.192.0/18',
4379 'SR': '186.179.128.0/17',
4380 'SS': '105.235.208.0/21',
4381 'ST': '197.159.160.0/19',
4382 'SV': '168.243.0.0/16',
4383 'SX': '190.102.0.0/20',
4384 'SY': '5.0.0.0/16',
4385 'SZ': '41.84.224.0/19',
4386 'TC': '65.255.48.0/20',
4387 'TD': '154.68.128.0/19',
4388 'TG': '196.168.0.0/14',
4389 'TH': '171.96.0.0/13',
4390 'TJ': '85.9.128.0/18',
4391 'TK': '27.96.24.0/21',
4392 'TL': '180.189.160.0/20',
4393 'TM': '95.85.96.0/19',
4394 'TN': '197.0.0.0/11',
4395 'TO': '175.176.144.0/21',
4396 'TR': '78.160.0.0/11',
4397 'TT': '186.44.0.0/15',
4398 'TV': '202.2.96.0/19',
4399 'TW': '120.96.0.0/11',
4400 'TZ': '156.156.0.0/14',
4401 'UA': '37.52.0.0/14',
4402 'UG': '102.80.0.0/13',
4403 'US': '6.0.0.0/8',
4404 'UY': '167.56.0.0/13',
4405 'UZ': '84.54.64.0/18',
4406 'VA': '212.77.0.0/19',
4407 'VC': '207.191.240.0/21',
4408 'VE': '186.88.0.0/13',
4409 'VG': '66.81.192.0/20',
4410 'VI': '146.226.0.0/16',
4411 'VN': '14.160.0.0/11',
4412 'VU': '202.80.32.0/20',
4413 'WF': '117.20.32.0/21',
4414 'WS': '202.4.32.0/19',
4415 'YE': '134.35.0.0/16',
4416 'YT': '41.242.116.0/22',
4417 'ZA': '41.0.0.0/11',
4418 'ZM': '102.144.0.0/13',
4419 'ZW': '102.177.192.0/18',
4420 }
4421
4422 @classmethod
4423 def random_ipv4(cls, code_or_block):
4424 if len(code_or_block) == 2:
4425 block = cls._country_ip_map.get(code_or_block.upper())
4426 if not block:
4427 return None
4428 else:
4429 block = code_or_block
4430 addr, preflen = block.split('/')
4431 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4432 addr_max = addr_min | (0xffffffff >> int(preflen))
4433 return compat_str(socket.inet_ntoa(
4434 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4435
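# A quick usage sketch; either a two-letter country code or an explicit
# CIDR block is accepted, and the returned address is random, so output varies:
#
#   GeoUtils.random_ipv4('DE')            # some address inside 53.0.0.0/8
#   GeoUtils.random_ipv4('192.0.2.0/24')  # some address inside the given block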
4436
4437 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4438 def __init__(self, proxies=None):
4439 # Set default handlers
4440 for type in ('http', 'https'):
4441 setattr(self, '%s_open' % type,
4442 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4443 meth(r, proxy, type))
4444 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4445
4446 def proxy_open(self, req, proxy, type):
4447 req_proxy = req.headers.get('Ytdl-request-proxy')
4448 if req_proxy is not None:
4449 proxy = req_proxy
4450 del req.headers['Ytdl-request-proxy']
4451
4452 if proxy == '__noproxy__':
4453 return None # No Proxy
4454 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4455 req.add_header('Ytdl-socks-proxy', proxy)
4456 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
4457 return None
4458 return compat_urllib_request.ProxyHandler.proxy_open(
4459 self, req, proxy, type)
4460
4461
4462 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4463 # released into Public Domain
4464 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4465
4466 def long_to_bytes(n, blocksize=0):
4467 """long_to_bytes(n:long, blocksize:int) : string
4468 Convert a long integer to a byte string.
4469
4470 If optional blocksize is given and greater than zero, pad the front of the
4471 byte string with binary zeros so that the length is a multiple of
4472 blocksize.
4473 """
4474 # after much testing, this algorithm was deemed to be the fastest
4475 s = b''
4476 n = int(n)
4477 while n > 0:
4478 s = compat_struct_pack('>I', n & 0xffffffff) + s
4479 n = n >> 32
4480 # strip off leading zeros
4481 for i in range(len(s)):
4482 if s[i] != b'\000'[0]:
4483 break
4484 else:
4485 # only happens when n == 0
4486 s = b'\000'
4487 i = 0
4488 s = s[i:]
4489 # add back some pad bytes. this could be done more efficiently w.r.t. the
4490 # de-padding being done above, but sigh...
4491 if blocksize > 0 and len(s) % blocksize:
4492 s = (blocksize - len(s) % blocksize) * b'\000' + s
4493 return s
4494
4495
4496 def bytes_to_long(s):
4497 """bytes_to_long(string) : long
4498 Convert a byte string to a long integer.
4499
4500 This is (essentially) the inverse of long_to_bytes().
4501 """
4502 acc = 0
4503 length = len(s)
4504 if length % 4:
4505 extra = (4 - length % 4)
4506 s = b'\000' * extra + s
4507 length = length + extra
4508 for i in range(0, length, 4):
4509 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4510 return acc
4511
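# A quick usage sketch; the two helpers are inverses of each other:
#
#   >>> long_to_bytes(65537)
#   b'\x01\x00\x01'
#   >>> long_to_bytes(65537, blocksize=4)
#   b'\x00\x01\x00\x01'
#   >>> bytes_to_long(b'\x01\x00\x01')
#   65537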
4512
4513 def ohdave_rsa_encrypt(data, exponent, modulus):
4514 '''
4515 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4516
4517 Input:
4518 data: data to encrypt, bytes-like object
4519 exponent, modulus: parameter e and N of RSA algorithm, both integer
4520 Output: hex string of encrypted data
4521
4522 Limitation: supports one block encryption only
4523 '''
4524
4525 payload = int(binascii.hexlify(data[::-1]), 16)
4526 encrypted = pow(payload, exponent, modulus)
4527 return '%x' % encrypted
4528
4529
4530 def pkcs1pad(data, length):
4531 """
4532 Padding input data with PKCS#1 scheme
4533
4534 @param {int[]} data input data
4535 @param {int} length target length
4536 @returns {int[]} padded data
4537 """
4538 if len(data) > length - 11:
4539 raise ValueError('Input data too long for PKCS#1 padding')
4540
4541 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 padding octets must be nonzero
4542 return [0, 2] + pseudo_random + [0] + data
4543
4544
4545 def encode_base_n(num, n, table=None):
4546 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4547 if not table:
4548 table = FULL_TABLE[:n]
4549
4550 if n > len(table):
4551 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4552
4553 if num == 0:
4554 return table[0]
4555
4556 ret = ''
4557 while num:
4558 ret = table[num % n] + ret
4559 num = num // n
4560 return ret
4561
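# A quick usage sketch:
#
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> encode_base_n(62, 62)  # digits beyond 9 come from the default table
#   '10'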
4562
4563 def decode_packed_codes(code):
4564 mobj = re.search(PACKED_CODES_RE, code)
4565 obfuscated_code, base, count, symbols = mobj.groups()
4566 base = int(base)
4567 count = int(count)
4568 symbols = symbols.split('|')
4569 symbol_table = {}
4570
4571 while count:
4572 count -= 1
4573 base_n_count = encode_base_n(count, base)
4574 symbol_table[base_n_count] = symbols[count] or base_n_count
4575
4576 return re.sub(
4577 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4578 obfuscated_code)
4579
4580
4581 def caesar(s, alphabet, shift):
4582 if shift == 0:
4583 return s
4584 l = len(alphabet)
4585 return ''.join(
4586 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4587 for c in s)
4588
4589
4590 def rot47(s):
4591 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4592
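# A quick usage sketch; since 47 is half of the 94-character alphabet,
# applying rot47 twice restores the input:
#
#   >>> caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 1)
#   'bcd'
#   >>> rot47('Hello!')
#   'w6==@P'
#   >>> rot47(rot47('Hello!'))
#   'Hello!'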
4593
4594 def parse_m3u8_attributes(attrib):
4595 info = {}
4596 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4597 if val.startswith('"'):
4598 val = val[1:-1]
4599 info[key] = val
4600 return info
4601
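# A quick usage sketch; the attribute list below is illustrative:
#
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}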
4602
4603 def urshift(val, n):
4604 return val >> n if val >= 0 else (val + 0x100000000) >> n
4605
4606
4607 # Based on png2str() written by @gdkchan and improved by @yokrysty
4608 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4609 def decode_png(png_data):
4610 # Reference: https://www.w3.org/TR/PNG/
4611 header = png_data[8:]
4612
4613 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4614 raise OSError('Not a valid PNG file.')
4615
4616 int_map = {1: '>B', 2: '>H', 4: '>I'}
4617 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4618
4619 chunks = []
4620
4621 while header:
4622 length = unpack_integer(header[:4])
4623 header = header[4:]
4624
4625 chunk_type = header[:4]
4626 header = header[4:]
4627
4628 chunk_data = header[:length]
4629 header = header[length:]
4630
4631 header = header[4:] # Skip CRC
4632
4633 chunks.append({
4634 'type': chunk_type,
4635 'length': length,
4636 'data': chunk_data
4637 })
4638
4639 ihdr = chunks[0]['data']
4640
4641 width = unpack_integer(ihdr[:4])
4642 height = unpack_integer(ihdr[4:8])
4643
4644 idat = b''
4645
4646 for chunk in chunks:
4647 if chunk['type'] == b'IDAT':
4648 idat += chunk['data']
4649
4650 if not idat:
4651 raise OSError('Unable to read PNG data.')
4652
4653 decompressed_data = bytearray(zlib.decompress(idat))
4654
4655 stride = width * 3
4656 pixels = []
4657
4658 def _get_pixel(idx):
4659 x = idx % stride
4660 y = idx // stride
4661 return pixels[y][x]
4662
4663 for y in range(height):
4664 basePos = y * (1 + stride)
4665 filter_type = decompressed_data[basePos]
4666
4667 current_row = []
4668
4669 pixels.append(current_row)
4670
4671 for x in range(stride):
4672 color = decompressed_data[1 + basePos + x]
4673 basex = y * stride + x
4674 left = 0
4675 up = 0
4676
4677 if x > 2:
4678 left = _get_pixel(basex - 3)
4679 if y > 0:
4680 up = _get_pixel(basex - stride)
4681
4682 if filter_type == 1: # Sub
4683 color = (color + left) & 0xff
4684 elif filter_type == 2: # Up
4685 color = (color + up) & 0xff
4686 elif filter_type == 3: # Average
4687 color = (color + ((left + up) >> 1)) & 0xff
4688 elif filter_type == 4: # Paeth
4689 a = left
4690 b = up
4691 c = 0
4692
4693 if x > 2 and y > 0:
4694 c = _get_pixel(basex - stride - 3)
4695
4696 p = a + b - c
4697
4698 pa = abs(p - a)
4699 pb = abs(p - b)
4700 pc = abs(p - c)
4701
4702 if pa <= pb and pa <= pc:
4703 color = (color + a) & 0xff
4704 elif pb <= pc:
4705 color = (color + b) & 0xff
4706 else:
4707 color = (color + c) & 0xff
4708
4709 current_row.append(color)
4710
4711 return width, height, pixels
4712
4713
4714 def write_xattr(path, key, value):
4715 # Windows: Write xattrs to NTFS Alternate Data Streams:
4716 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4717 if compat_os_name == 'nt':
4718 assert ':' not in key
4719 assert os.path.exists(path)
4720
4721 try:
4722 with open(f'{path}:{key}', 'wb') as f:
4723 f.write(value)
4724 except OSError as e:
4725 raise XAttrMetadataError(e.errno, e.strerror)
4726 return
4727
4728 # UNIX Method 1. Use xattrs/pyxattrs modules
4729 from .dependencies import xattr
4730
4731 setxattr = None
4732 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4733 # Unicode arguments are not supported in pyxattr until version 0.5.0
4734 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4735 if version_tuple(xattr.__version__) >= (0, 5, 0):
4736 setxattr = xattr.set
4737 elif xattr:
4738 setxattr = xattr.setxattr
4739
4740 if setxattr:
4741 try:
4742 setxattr(path, key, value)
4743 except OSError as e:
4744 raise XAttrMetadataError(e.errno, e.strerror)
4745 return
4746
4747 # UNIX Method 2. Use setfattr/xattr executables
4748 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4749 else 'xattr' if check_executable('xattr', ['-h']) else None)
4750 if not exe:
4751 raise XAttrUnavailableError(
4752 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4753 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4754
4755 value = value.decode()
4756 try:
4757 p = Popen(
4758 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4759 stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4760 except OSError as e:
4761 raise XAttrMetadataError(e.errno, e.strerror)
4762 stderr = p.communicate_or_kill()[1].decode('utf-8', 'replace')
4763 if p.returncode:
4764 raise XAttrMetadataError(p.returncode, stderr)
4765
4766
4767 def random_birthday(year_field, month_field, day_field):
4768 start_date = datetime.date(1950, 1, 1)
4769 end_date = datetime.date(1995, 12, 31)
4770 offset = random.randint(0, (end_date - start_date).days)
4771 random_date = start_date + datetime.timedelta(offset)
4772 return {
4773 year_field: str(random_date.year),
4774 month_field: str(random_date.month),
4775 day_field: str(random_date.day),
4776 }
4777
4778
4779 # Templates for internet shortcut files, which are plain text files.
4780 DOT_URL_LINK_TEMPLATE = '''\
4781 [InternetShortcut]
4782 URL=%(url)s
4783 '''
4784
4785 DOT_WEBLOC_LINK_TEMPLATE = '''\
4786 <?xml version="1.0" encoding="UTF-8"?>
4787 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4788 <plist version="1.0">
4789 <dict>
4790 \t<key>URL</key>
4791 \t<string>%(url)s</string>
4792 </dict>
4793 </plist>
4794 '''
4795
4796 DOT_DESKTOP_LINK_TEMPLATE = '''\
4797 [Desktop Entry]
4798 Encoding=UTF-8
4799 Name=%(filename)s
4800 Type=Link
4801 URL=%(url)s
4802 Icon=text-html
4803 '''
4804
4805 LINK_TEMPLATES = {
4806 'url': DOT_URL_LINK_TEMPLATE,
4807 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4808 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4809 }
4810
4811
4812 def iri_to_uri(iri):
4813 """
4814 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4815
4816 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4817 """
4818
4819 iri_parts = compat_urllib_parse_urlparse(iri)
4820
4821 if '[' in iri_parts.netloc:
4822 raise ValueError('IPv6 URIs are not yet supported.')
4823 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4824
4825 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4826
4827 net_location = ''
4828 if iri_parts.username:
4829 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
4830 if iri_parts.password is not None:
4831 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
4832 net_location += '@'
4833
4834 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
4835 # The 'idna' encoding produces ASCII text.
4836 if iri_parts.port is not None and iri_parts.port != 80:
4837 net_location += ':' + str(iri_parts.port)
4838
4839 return urllib.parse.urlunparse(
4840 (iri_parts.scheme,
4841 net_location,
4842
4843 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4844
4845 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4846 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4847
4848 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4849 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4850
4851 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4852
4853 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4854
4855
4856 def to_high_limit_path(path):
4857 if sys.platform in ['win32', 'cygwin']:
4858 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4859 return '\\\\?\\' + os.path.abspath(path)
4860
4861 return path
4862
4863
4864 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4865 val = traverse_obj(obj, *variadic(field))
4866 if val in ignore:
4867 return default
4868 return template % (func(val) if func else val)
4869
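# A quick usage sketch; the field values below are illustrative:
#
#   >>> format_field({'height': 720}, 'height', '%sp')
#   '720p'
#   >>> format_field({'height': None}, 'height', '%sp', default='unknown')
#   'unknown'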
4870
4871 def clean_podcast_url(url):
4872 return re.sub(r'''(?x)
4873 (?:
4874 (?:
4875 chtbl\.com/track|
4876 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4877 play\.podtrac\.com
4878 )/[^/]+|
4879 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4880 flex\.acast\.com|
4881 pd(?:
4882 cn\.co| # https://podcorn.com/analytics-prefix/
4883 st\.fm # https://podsights.com/docs/
4884 )/e
4885 )/''', '', url)
4886
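# A quick usage sketch; the tracking-prefixed URL below is illustrative:
#
#   >>> clean_podcast_url('https://chtbl.com/track/12345/example.com/episode.mp3')
#   'https://example.com/episode.mp3'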
4887
4888 _HEX_TABLE = '0123456789abcdef'
4889
4890
4891 def random_uuidv4():
4892 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4893
4894
4895 def make_dir(path, to_screen=None):
4896 try:
4897 dn = os.path.dirname(path)
4898 if dn and not os.path.exists(dn):
4899 os.makedirs(dn)
4900 return True
4901 except OSError as err:
4902 if callable(to_screen):
4903 to_screen('unable to create directory ' + error_to_compat_str(err))
4904 return False
4905
4906
4907 def get_executable_path():
4908 from zipimport import zipimporter
4909 if hasattr(sys, 'frozen'): # Running from PyInstaller
4910 path = os.path.dirname(sys.executable)
4911 elif isinstance(__loader__, zipimporter): # Running from ZIP
4912 path = os.path.join(os.path.dirname(__file__), '../..')
4913 else:
4914 path = os.path.join(os.path.dirname(__file__), '..')
4915 return os.path.abspath(path)
4916
4917
4918 def load_plugins(name, suffix, namespace):
4919 classes = {}
4920 with contextlib.suppress(FileNotFoundError):
4921 plugins_spec = importlib.util.spec_from_file_location(
4922 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
4923 plugins = importlib.util.module_from_spec(plugins_spec)
4924 sys.modules[plugins_spec.name] = plugins
4925 plugins_spec.loader.exec_module(plugins)
4926 for name in dir(plugins):
4927 if name in namespace:
4928 continue
4929 if not name.endswith(suffix):
4930 continue
4931 klass = getattr(plugins, name)
4932 classes[name] = namespace[name] = klass
4933 return classes
4934
4935
4936 def traverse_obj(
4937 obj, *path_list, default=None, expected_type=None, get_all=True,
4938 casesense=True, is_user_input=False, traverse_string=False):
4939 ''' Traverse nested list/dict/tuple
4940 @param path_list A list of paths which are checked one by one.
4941 Each path is a list of keys where each key is a:
4942 - None: Do nothing
4943 - string: A dictionary key
4944 - int: An index into a list
4945 - tuple: A list of keys all of which will be traversed
4946 - Ellipsis: Fetch all values in the object
4947 - Function: Takes the key and value as arguments
4948 and returns whether the key matches or not
4949 @param default Default value to return
4950 @param expected_type Only accept final value of this type (Can also be any callable)
4951 @param get_all Return all the values obtained from a path or only the first one
4952 @param casesense Whether to consider dictionary keys as case sensitive
4953 @param is_user_input Whether the keys are generated from user input. If True,
4954 strings are converted to int/slice if necessary
4955 @param traverse_string Whether to traverse inside strings. If True, any
4956 non-compatible object will also be converted into a string
4957 # TODO: Write tests
4958 '''
4959 if not casesense:
4960 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
4961 path_list = (map(_lower, variadic(path)) for path in path_list)
4962
4963 def _traverse_obj(obj, path, _current_depth=0):
4964 nonlocal depth
4965 path = tuple(variadic(path))
4966 for i, key in enumerate(path):
4967 if None in (key, obj):
4968 return obj
4969 if isinstance(key, (list, tuple)):
4970 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
4971 key = ...
4972 if key is ...:
4973 obj = (obj.values() if isinstance(obj, dict)
4974 else obj if isinstance(obj, (list, tuple, LazyList))
4975 else str(obj) if traverse_string else [])
4976 _current_depth += 1
4977 depth = max(depth, _current_depth)
4978 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
4979 elif callable(key):
4980 if isinstance(obj, (list, tuple, LazyList)):
4981 obj = enumerate(obj)
4982 elif isinstance(obj, dict):
4983 obj = obj.items()
4984 else:
4985 if not traverse_string:
4986 return None
4987 obj = str(obj)
4988 _current_depth += 1
4989 depth = max(depth, _current_depth)
4990 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
4991 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
4992 obj = (obj.get(key) if casesense or (key in obj)
4993 else next((v for k, v in obj.items() if _lower(k) == key), None))
4994 else:
4995 if is_user_input:
4996 key = (int_or_none(key) if ':' not in key
4997 else slice(*map(int_or_none, key.split(':'))))
4998 if key == slice(None):
4999 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5000 if not isinstance(key, (int, slice)):
5001 return None
5002 if not isinstance(obj, (list, tuple, LazyList)):
5003 if not traverse_string:
5004 return None
5005 obj = str(obj)
5006 try:
5007 obj = obj[key]
5008 except IndexError:
5009 return None
5010 return obj
5011
5012 if isinstance(expected_type, type):
5013 type_test = lambda val: val if isinstance(val, expected_type) else None
5014 elif expected_type is not None:
5015 type_test = expected_type
5016 else:
5017 type_test = lambda val: val
5018
5019 for path in path_list:
5020 depth = 0
5021 val = _traverse_obj(obj, path)
5022 if val is not None:
5023 if depth:
5024 for _ in range(depth - 1):
5025 val = itertools.chain.from_iterable(v for v in val if v is not None)
5026 val = [v for v in map(type_test, val) if v is not None]
5027 if val:
5028 return val if get_all else val[0]
5029 else:
5030 val = type_test(val)
5031 if val is not None:
5032 return val
5033 return default
5034
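# A quick usage sketch; the nested data below is illustrative:
#
#   >>> data = {'formats': [{'url': 'http://a'}, {'height': 720, 'url': 'http://b'}]}
#   >>> traverse_obj(data, ('formats', 0, 'url'))
#   'http://a'
#   >>> traverse_obj(data, ('formats', ..., 'url'))  # Ellipsis fetches all values
#   ['http://a', 'http://b']
#   >>> traverse_obj(data, ('formats', 1, 'height'), expected_type=int)
#   720
#   >>> traverse_obj(data, ('formats', 0, 'height'), default=0)
#   0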
5035
5036 def traverse_dict(dictn, keys, casesense=True):
5037 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5038 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5039 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5040
5041
5042 def get_first(obj, keys, **kwargs):
5043 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5044
5045
5046 def variadic(x, allowed_types=(str, bytes, dict)):
5047 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5048
5049
5050 def decode_base(value, digits):
5051 # Convert the given base-x string to an integer
5052 table = {char: index for index, char in enumerate(digits)}
5053 result = 0
5054 base = len(digits)
5055 for char in value:
5056 result *= base
5057 result += table[char]
5058 return result
5059
5060
5061 def time_seconds(**kwargs):
5062 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5063 return t.timestamp()
5064
5065
5066 # Create a JSON Web Signature (JWS) with the HS256 algorithm
5067 # The resulting format is JWS Compact Serialization
5068 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5069 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5070 def jwt_encode_hs256(payload_data, key, headers={}):
5071 header_data = {
5072 'alg': 'HS256',
5073 'typ': 'JWT',
5074 }
5075 if headers:
5076 header_data.update(headers)
5077 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5078 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5079 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5080 signature_b64 = base64.b64encode(h.digest())
5081 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5082 return token
5083
5084
5085 # This can be extended in the future to verify the signature and parse the header, returning the algorithm used if it's not HS256
5086 def jwt_decode_hs256(jwt):
5087 header_b64, payload_b64, signature_b64 = jwt.split('.')
5088 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5089 return payload_data
5090
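# A quick usage sketch; the key and payload below are illustrative:
#
#   >>> token = jwt_encode_hs256({'uid': 123}, 'secret')
#   >>> jwt_decode_hs256(token.decode())
#   {'uid': 123}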
5091
5092 def supports_terminal_sequences(stream):
5093 if compat_os_name == 'nt':
5094 from .compat import WINDOWS_VT_MODE # Must be imported locally
5095 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5096 return False
5097 elif not os.getenv('TERM'):
5098 return False
5099 try:
5100 return stream.isatty()
5101 except BaseException:
5102 return False
5103
5104
5105 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5106
5107
5108 def remove_terminal_sequences(string):
5109 return _terminal_sequences_re.sub('', string)
5110
5111
5112 def number_of_digits(number):
5113 return len('%d' % number)
5114
5115
5116 def join_nonempty(*values, delim='-', from_dict=None):
5117 if from_dict is not None:
5118 values = map(from_dict.get, values)
5119 return delim.join(map(str, filter(None, values)))
5120
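# A quick usage sketch:
#
#   >>> join_nonempty('mp4', None, 720, delim='-')
#   'mp4-720'
#   >>> join_nonempty('height', 'width', from_dict={'height': 720})
#   '720'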
5121
5122 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5123 """
5124 Find the largest format dimensions in terms of video width and, for each thumbnail:
5125 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5126 * Update dimensions
5127
5128 This function is useful with video services that scale the provided thumbnails on demand
5129 """
5130 _keys = ('width', 'height')
5131 max_dimensions = max(
5132 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5133 default=(0, 0))
5134 if not max_dimensions[0]:
5135 return thumbnails
5136 return [
5137 merge_dicts(
5138 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5139 dict(zip(_keys, max_dimensions)), thumbnail)
5140 for thumbnail in thumbnails
5141 ]
5142
5143
5144 def parse_http_range(range):
5145 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5146 if not range:
5147 return None, None, None
5148 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5149 if not crg:
5150 return None, None, None
5151 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5152
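# A quick usage sketch:
#
#   >>> parse_http_range('bytes 500-999/1234')
#   (500, 999, 1234)
#   >>> parse_http_range('bytes=500-')
#   (500, None, None)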
5153
5154 class Config:
5155 own_args = None
5156 filename = None
5157 __initialized = False
5158
5159 def __init__(self, parser, label=None):
5160 self._parser, self.label = parser, label
5161 self._loaded_paths, self.configs = set(), []
5162
5163 def init(self, args=None, filename=None):
5164 assert not self.__initialized
5165 directory = ''
5166 if filename:
5167 location = os.path.realpath(filename)
5168 directory = os.path.dirname(location)
5169 if location in self._loaded_paths:
5170 return False
5171 self._loaded_paths.add(location)
5172
5173 self.__initialized = True
5174 self.own_args, self.filename = args, filename
5175 for location in self._parser.parse_args(args)[0].config_locations or []:
5176 location = os.path.join(directory, expand_path(location))
5177 if os.path.isdir(location):
5178 location = os.path.join(location, 'yt-dlp.conf')
5179 if not os.path.exists(location):
5180 self._parser.error(f'config location {location} does not exist')
5181 self.append_config(self.read_file(location), location)
5182 return True
5183
5184 def __str__(self):
5185 label = join_nonempty(
5186 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5187 delim=' ')
5188 return join_nonempty(
5189 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5190 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5191 delim='\n')
5192
5193 @staticmethod
5194 def read_file(filename, default=[]):
5195 try:
5196 optionf = open(filename)
5197 except OSError:
5198 return default # silently skip if file is not present
5199 try:
5200 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5201 contents = optionf.read()
5202 res = shlex.split(contents, comments=True)
5203 finally:
5204 optionf.close()
5205 return res
5206
5207 @staticmethod
5208 def hide_login_info(opts):
5209 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5210 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5211
5212 def _scrub_eq(o):
5213 m = eqre.match(o)
5214 if m:
5215 return m.group('key') + '=PRIVATE'
5216 else:
5217 return o
5218
5219 opts = list(map(_scrub_eq, opts))
5220 for idx, opt in enumerate(opts):
5221 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5222 opts[idx + 1] = 'PRIVATE'
5223 return opts
5224
5225 def append_config(self, *args, label=None):
5226 config = type(self)(self._parser, label)
5227 config._loaded_paths = self._loaded_paths
5228 if config.init(*args):
5229 self.configs.append(config)
5230
5231 @property
5232 def all_args(self):
5233 for config in reversed(self.configs):
5234 yield from config.all_args
5235 yield from self.own_args or []
5236
5237 def parse_args(self):
5238 return self._parser.parse_args(self.all_args)
5239
5240
5241 class WebSocketsWrapper:
5242 """Wraps the websockets module for use in non-async scopes"""
5243 pool = None
5244
5245 def __init__(self, url, headers=None, connect=True):
5246 self.loop = asyncio.new_event_loop()
5247 # XXX: "loop" is deprecated
5248 self.conn = websockets.connect(
5249 url, extra_headers=headers, ping_interval=None,
5250 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5251 if connect:
5252 self.__enter__()
5253 atexit.register(self.__exit__, None, None, None)
5254
5255 def __enter__(self):
5256 if not self.pool:
5257 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5258 return self
5259
5260 def send(self, *args):
5261 self.run_with_loop(self.pool.send(*args), self.loop)
5262
5263 def recv(self, *args):
5264 return self.run_with_loop(self.pool.recv(*args), self.loop)
5265
5266 def __exit__(self, type, value, traceback):
5267 try:
5268 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5269 finally:
5270 self._cancel_all_tasks(self.loop)  # cancel pending tasks before the loop is closed
5271 self.loop.close()
5272
5273 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5274 # for contributors: if any new library that uses asyncio needs to run in non-async code, move these functions out of this class
5275 @staticmethod
5276 def run_with_loop(main, loop):
5277 if not asyncio.iscoroutine(main):
5278 raise ValueError(f'a coroutine was expected, got {main!r}')
5279
5280 try:
5281 return loop.run_until_complete(main)
5282 finally:
5283 loop.run_until_complete(loop.shutdown_asyncgens())
5284 if hasattr(loop, 'shutdown_default_executor'):
5285 loop.run_until_complete(loop.shutdown_default_executor())
5286
5287 @staticmethod
5288 def _cancel_all_tasks(loop):
5289 to_cancel = asyncio.all_tasks(loop)
5290
5291 if not to_cancel:
5292 return
5293
5294 for task in to_cancel:
5295 task.cancel()
5296
5297 # XXX: "loop" is removed in python 3.10+
5298 loop.run_until_complete(
5299 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5300
5301 for task in to_cancel:
5302 if task.cancelled():
5303 continue
5304 if task.exception() is not None:
5305 loop.call_exception_handler({
5306 'message': 'unhandled exception during asyncio.run() shutdown',
5307 'exception': task.exception(),
5308 'task': task,
5309 })
5310
5311
5312 def merge_headers(*dicts):
5313 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5314 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5315
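# A quick usage sketch; later dicts win regardless of key casing:
#
#   >>> merge_headers({'user-agent': 'UA1'}, {'User-Agent': 'UA2', 'Accept': '*/*'})
#   {'User-Agent': 'UA2', 'Accept': '*/*'}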
5316
5317 class classproperty:
5318 def __init__(self, f):
5319 functools.update_wrapper(self, f)
5320 self.f = f
5321
5322 def __get__(self, _, cls):
5323 return self.f(cls)
5324
5325
5326 class Namespace:
5327 """Immutable namespace"""
5328
5329 def __init__(self, **kwargs):
5330 self._dict = kwargs
5331
5332 def __getattr__(self, attr):
5333 return self._dict[attr]
5334
5335 def __contains__(self, item):
5336 return item in self._dict.values()
5337
5338 def __iter__(self):
5339 return iter(self._dict.items())
5340
5341 def __repr__(self):
5342 return f'{type(self).__name__}({", ".join(f"{k}={v}" for k, v in self)})'
5343
5344
5345 # Deprecated
5346 has_certifi = bool(certifi)
5347 has_websockets = bool(websockets)