]> jfr.im git - yt-dlp.git/blob - yt_dlp/utils.py
[cleanup] Misc
[yt-dlp.git] / yt_dlp / utils.py
1 #!/usr/bin/env python3
2 import atexit
3 import base64
4 import binascii
5 import calendar
6 import codecs
7 import collections
8 import contextlib
9 import ctypes
10 import datetime
11 import email.header
12 import email.utils
13 import errno
14 import gzip
15 import hashlib
16 import hmac
17 import importlib.util
18 import io
19 import itertools
20 import json
21 import locale
22 import math
23 import mimetypes
24 import operator
25 import os
26 import platform
27 import random
28 import re
29 import shlex
30 import socket
31 import ssl
32 import subprocess
33 import sys
34 import tempfile
35 import time
36 import traceback
37 import urllib.parse
38 import xml.etree.ElementTree
39 import zlib
40
41 from .compat import asyncio, functools # isort: split
42 from .compat import (
43 compat_chr,
44 compat_cookiejar,
45 compat_etree_fromstring,
46 compat_expanduser,
47 compat_html_entities,
48 compat_html_entities_html5,
49 compat_HTMLParseError,
50 compat_HTMLParser,
51 compat_http_client,
52 compat_HTTPError,
53 compat_os_name,
54 compat_parse_qs,
55 compat_shlex_quote,
56 compat_str,
57 compat_struct_pack,
58 compat_struct_unpack,
59 compat_urllib_error,
60 compat_urllib_parse_unquote_plus,
61 compat_urllib_parse_urlencode,
62 compat_urllib_parse_urlparse,
63 compat_urllib_request,
64 compat_urlparse,
65 )
66 from .dependencies import brotli, certifi, websockets
67 from .socks import ProxyType, sockssocket
68
69
70 def register_socks_protocols():
71 # "Register" SOCKS protocols
72 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
73 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
74 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
75 if scheme not in compat_urlparse.uses_netloc:
76 compat_urlparse.uses_netloc.append(scheme)
77
78
79 # This is not clearly defined otherwise
80 compiled_regex_type = type(re.compile(''))
81
82
83 def random_user_agent():
84 _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
85 _CHROME_VERSIONS = (
86 '90.0.4430.212',
87 '90.0.4430.24',
88 '90.0.4430.70',
89 '90.0.4430.72',
90 '90.0.4430.85',
91 '90.0.4430.93',
92 '91.0.4472.101',
93 '91.0.4472.106',
94 '91.0.4472.114',
95 '91.0.4472.124',
96 '91.0.4472.164',
97 '91.0.4472.19',
98 '91.0.4472.77',
99 '92.0.4515.107',
100 '92.0.4515.115',
101 '92.0.4515.131',
102 '92.0.4515.159',
103 '92.0.4515.43',
104 '93.0.4556.0',
105 '93.0.4577.15',
106 '93.0.4577.63',
107 '93.0.4577.82',
108 '94.0.4606.41',
109 '94.0.4606.54',
110 '94.0.4606.61',
111 '94.0.4606.71',
112 '94.0.4606.81',
113 '94.0.4606.85',
114 '95.0.4638.17',
115 '95.0.4638.50',
116 '95.0.4638.54',
117 '95.0.4638.69',
118 '95.0.4638.74',
119 '96.0.4664.18',
120 '96.0.4664.45',
121 '96.0.4664.55',
122 '96.0.4664.93',
123 '97.0.4692.20',
124 )
125 return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
126
127
128 SUPPORTED_ENCODINGS = [
129 'gzip', 'deflate'
130 ]
131 if brotli:
132 SUPPORTED_ENCODINGS.append('br')
133
134 std_headers = {
135 'User-Agent': random_user_agent(),
136 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
137 'Accept-Language': 'en-us,en;q=0.5',
138 'Sec-Fetch-Mode': 'navigate',
139 }
140
141
142 USER_AGENTS = {
143 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
144 }
145
146
147 NO_DEFAULT = object()
148
149 ENGLISH_MONTH_NAMES = [
150 'January', 'February', 'March', 'April', 'May', 'June',
151 'July', 'August', 'September', 'October', 'November', 'December']
152
153 MONTH_NAMES = {
154 'en': ENGLISH_MONTH_NAMES,
155 'fr': [
156 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
157 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
158 }
159
160 KNOWN_EXTENSIONS = (
161 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
162 'flv', 'f4v', 'f4a', 'f4b',
163 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
164 'mkv', 'mka', 'mk3d',
165 'avi', 'divx',
166 'mov',
167 'asf', 'wmv', 'wma',
168 '3gp', '3g2',
169 'mp3',
170 'flac',
171 'ape',
172 'wav',
173 'f4f', 'f4m', 'm3u8', 'smil')
174
175 # needed for sanitizing filenames in restricted mode
176 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
177 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
178 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
179
180 DATE_FORMATS = (
181 '%d %B %Y',
182 '%d %b %Y',
183 '%B %d %Y',
184 '%B %dst %Y',
185 '%B %dnd %Y',
186 '%B %drd %Y',
187 '%B %dth %Y',
188 '%b %d %Y',
189 '%b %dst %Y',
190 '%b %dnd %Y',
191 '%b %drd %Y',
192 '%b %dth %Y',
193 '%b %dst %Y %I:%M',
194 '%b %dnd %Y %I:%M',
195 '%b %drd %Y %I:%M',
196 '%b %dth %Y %I:%M',
197 '%Y %m %d',
198 '%Y-%m-%d',
199 '%Y.%m.%d.',
200 '%Y/%m/%d',
201 '%Y/%m/%d %H:%M',
202 '%Y/%m/%d %H:%M:%S',
203 '%Y%m%d%H%M',
204 '%Y%m%d%H%M%S',
205 '%Y%m%d',
206 '%Y-%m-%d %H:%M',
207 '%Y-%m-%d %H:%M:%S',
208 '%Y-%m-%d %H:%M:%S.%f',
209 '%Y-%m-%d %H:%M:%S:%f',
210 '%d.%m.%Y %H:%M',
211 '%d.%m.%Y %H.%M',
212 '%Y-%m-%dT%H:%M:%SZ',
213 '%Y-%m-%dT%H:%M:%S.%fZ',
214 '%Y-%m-%dT%H:%M:%S.%f0Z',
215 '%Y-%m-%dT%H:%M:%S',
216 '%Y-%m-%dT%H:%M:%S.%f',
217 '%Y-%m-%dT%H:%M',
218 '%b %d %Y at %H:%M',
219 '%b %d %Y at %H:%M:%S',
220 '%B %d %Y at %H:%M',
221 '%B %d %Y at %H:%M:%S',
222 '%H:%M %d-%b-%Y',
223 )
224
225 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
226 DATE_FORMATS_DAY_FIRST.extend([
227 '%d-%m-%Y',
228 '%d.%m.%Y',
229 '%d.%m.%y',
230 '%d/%m/%Y',
231 '%d/%m/%y',
232 '%d/%m/%Y %H:%M:%S',
233 ])
234
235 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
236 DATE_FORMATS_MONTH_FIRST.extend([
237 '%m-%d-%Y',
238 '%m.%d.%Y',
239 '%m/%d/%Y',
240 '%m/%d/%y',
241 '%m/%d/%Y %H:%M:%S',
242 ])
243
244 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
245 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
246
247 NUMBER_RE = r'\d+(?:\.\d+)?'
248
249
250 @functools.cache
251 def preferredencoding():
252 """Get preferred encoding.
253
254 Returns the best encoding scheme for the system, based on
255 locale.getpreferredencoding() and some further tweaks.
256 """
257 try:
258 pref = locale.getpreferredencoding()
259 'TEST'.encode(pref)
260 except Exception:
261 pref = 'UTF-8'
262
263 return pref
264
265
266 def write_json_file(obj, fn):
267 """ Encode obj as JSON and write it to fn, atomically if possible """
268
269 tf = tempfile.NamedTemporaryFile(
270 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
271 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
272
273 try:
274 with tf:
275 json.dump(obj, tf, ensure_ascii=False)
276 if sys.platform == 'win32':
277 # Need to remove existing file on Windows, else os.rename raises
278 # WindowsError or FileExistsError.
279 with contextlib.suppress(OSError):
280 os.unlink(fn)
281 with contextlib.suppress(OSError):
282 mask = os.umask(0)
283 os.umask(mask)
284 os.chmod(tf.name, 0o666 & ~mask)
285 os.rename(tf.name, fn)
286 except Exception:
287 with contextlib.suppress(OSError):
288 os.remove(tf.name)
289 raise
290
291
292 def find_xpath_attr(node, xpath, key, val=None):
293 """ Find the xpath xpath[@key=val] """
294 assert re.match(r'^[a-zA-Z_-]+$', key)
295 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
296 return node.find(expr)
297
298 # On python2.6 the xml.etree.ElementTree.Element methods don't support
299 # the namespace parameter
300
301
302 def xpath_with_ns(path, ns_map):
303 components = [c.split(':') for c in path.split('/')]
304 replaced = []
305 for c in components:
306 if len(c) == 1:
307 replaced.append(c[0])
308 else:
309 ns, tag = c
310 replaced.append('{%s}%s' % (ns_map[ns], tag))
311 return '/'.join(replaced)
312
313
314 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
315 def _find_xpath(xpath):
316 return node.find(xpath)
317
318 if isinstance(xpath, (str, compat_str)):
319 n = _find_xpath(xpath)
320 else:
321 for xp in xpath:
322 n = _find_xpath(xp)
323 if n is not None:
324 break
325
326 if n is None:
327 if default is not NO_DEFAULT:
328 return default
329 elif fatal:
330 name = xpath if name is None else name
331 raise ExtractorError('Could not find XML element %s' % name)
332 else:
333 return None
334 return n
335
336
337 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
338 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
339 if n is None or n == default:
340 return n
341 if n.text is None:
342 if default is not NO_DEFAULT:
343 return default
344 elif fatal:
345 name = xpath if name is None else name
346 raise ExtractorError('Could not find XML element\'s text %s' % name)
347 else:
348 return None
349 return n.text
350
351
352 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
353 n = find_xpath_attr(node, xpath, key)
354 if n is None:
355 if default is not NO_DEFAULT:
356 return default
357 elif fatal:
358 name = f'{xpath}[@{key}]' if name is None else name
359 raise ExtractorError('Could not find XML attribute %s' % name)
360 else:
361 return None
362 return n.attrib[key]
363
364
365 def get_element_by_id(id, html, **kwargs):
366 """Return the content of the tag with the specified ID in the passed HTML document"""
367 return get_element_by_attribute('id', id, html, **kwargs)
368
369
370 def get_element_html_by_id(id, html, **kwargs):
371 """Return the html of the tag with the specified ID in the passed HTML document"""
372 return get_element_html_by_attribute('id', id, html, **kwargs)
373
374
375 def get_element_by_class(class_name, html):
376 """Return the content of the first tag with the specified class in the passed HTML document"""
377 retval = get_elements_by_class(class_name, html)
378 return retval[0] if retval else None
379
380
381 def get_element_html_by_class(class_name, html):
382 """Return the html of the first tag with the specified class in the passed HTML document"""
383 retval = get_elements_html_by_class(class_name, html)
384 return retval[0] if retval else None
385
386
387 def get_element_by_attribute(attribute, value, html, **kwargs):
388 retval = get_elements_by_attribute(attribute, value, html, **kwargs)
389 return retval[0] if retval else None
390
391
392 def get_element_html_by_attribute(attribute, value, html, **kargs):
393 retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
394 return retval[0] if retval else None
395
396
397 def get_elements_by_class(class_name, html, **kargs):
398 """Return the content of all tags with the specified class in the passed HTML document as a list"""
399 return get_elements_by_attribute(
400 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
401 html, escape_value=False)
402
403
404 def get_elements_html_by_class(class_name, html):
405 """Return the html of all tags with the specified class in the passed HTML document as a list"""
406 return get_elements_html_by_attribute(
407 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
408 html, escape_value=False)
409
410
411 def get_elements_by_attribute(*args, **kwargs):
412 """Return the content of the tag with the specified attribute in the passed HTML document"""
413 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
414
415
416 def get_elements_html_by_attribute(*args, **kwargs):
417 """Return the html of the tag with the specified attribute in the passed HTML document"""
418 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
419
420
421 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
422 """
423 Return the text (content) and the html (whole) of the tag with the specified
424 attribute in the passed HTML document
425 """
426
427 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
428
429 value = re.escape(value) if escape_value else value
430
431 partial_element_re = rf'''(?x)
432 <(?P<tag>[a-zA-Z0-9:._-]+)
433 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
434 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
435 '''
436
437 for m in re.finditer(partial_element_re, html):
438 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
439
440 yield (
441 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
442 whole
443 )
444
445
446 class HTMLBreakOnClosingTagParser(compat_HTMLParser):
447 """
448 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
449 closing tag for the first opening tag it has encountered, and can be used
450 as a context manager
451 """
452
453 class HTMLBreakOnClosingTagException(Exception):
454 pass
455
456 def __init__(self):
457 self.tagstack = collections.deque()
458 compat_HTMLParser.__init__(self)
459
460 def __enter__(self):
461 return self
462
463 def __exit__(self, *_):
464 self.close()
465
466 def close(self):
467 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
468 # so data remains buffered; we no longer have any interest in it, thus
469 # override this method to discard it
470 pass
471
472 def handle_starttag(self, tag, _):
473 self.tagstack.append(tag)
474
475 def handle_endtag(self, tag):
476 if not self.tagstack:
477 raise compat_HTMLParseError('no tags in the stack')
478 while self.tagstack:
479 inner_tag = self.tagstack.pop()
480 if inner_tag == tag:
481 break
482 else:
483 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
484 if not self.tagstack:
485 raise self.HTMLBreakOnClosingTagException()
486
487
488 def get_element_text_and_html_by_tag(tag, html):
489 """
490 For the first element with the specified tag in the passed HTML document
491 return its' content (text) and the whole element (html)
492 """
493 def find_or_raise(haystack, needle, exc):
494 try:
495 return haystack.index(needle)
496 except ValueError:
497 raise exc
498 closing_tag = f'</{tag}>'
499 whole_start = find_or_raise(
500 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
501 content_start = find_or_raise(
502 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
503 content_start += whole_start + 1
504 with HTMLBreakOnClosingTagParser() as parser:
505 parser.feed(html[whole_start:content_start])
506 if not parser.tagstack or parser.tagstack[0] != tag:
507 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
508 offset = content_start
509 while offset < len(html):
510 next_closing_tag_start = find_or_raise(
511 html[offset:], closing_tag,
512 compat_HTMLParseError(f'closing {tag} tag not found'))
513 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
514 try:
515 parser.feed(html[offset:offset + next_closing_tag_end])
516 offset += next_closing_tag_end
517 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
518 return html[content_start:offset + next_closing_tag_start], \
519 html[whole_start:offset + next_closing_tag_end]
520 raise compat_HTMLParseError('unexpected end of html')
521
522
523 class HTMLAttributeParser(compat_HTMLParser):
524 """Trivial HTML parser to gather the attributes for a single element"""
525
526 def __init__(self):
527 self.attrs = {}
528 compat_HTMLParser.__init__(self)
529
530 def handle_starttag(self, tag, attrs):
531 self.attrs = dict(attrs)
532
533
534 class HTMLListAttrsParser(compat_HTMLParser):
535 """HTML parser to gather the attributes for the elements of a list"""
536
537 def __init__(self):
538 compat_HTMLParser.__init__(self)
539 self.items = []
540 self._level = 0
541
542 def handle_starttag(self, tag, attrs):
543 if tag == 'li' and self._level == 0:
544 self.items.append(dict(attrs))
545 self._level += 1
546
547 def handle_endtag(self, tag):
548 self._level -= 1
549
550
551 def extract_attributes(html_element):
552 """Given a string for an HTML element such as
553 <el
554 a="foo" B="bar" c="&98;az" d=boz
555 empty= noval entity="&amp;"
556 sq='"' dq="'"
557 >
558 Decode and return a dictionary of attributes.
559 {
560 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
561 'empty': '', 'noval': None, 'entity': '&',
562 'sq': '"', 'dq': '\''
563 }.
564 """
565 parser = HTMLAttributeParser()
566 with contextlib.suppress(compat_HTMLParseError):
567 parser.feed(html_element)
568 parser.close()
569 return parser.attrs
570
571
572 def parse_list(webpage):
573 """Given a string for an series of HTML <li> elements,
574 return a dictionary of their attributes"""
575 parser = HTMLListAttrsParser()
576 parser.feed(webpage)
577 parser.close()
578 return parser.items
579
580
581 def clean_html(html):
582 """Clean an HTML snippet into a readable string"""
583
584 if html is None: # Convenience for sanitizing descriptions etc.
585 return html
586
587 html = re.sub(r'\s+', ' ', html)
588 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
589 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
590 # Strip html tags
591 html = re.sub('<.*?>', '', html)
592 # Replace html entities
593 html = unescapeHTML(html)
594 return html.strip()
595
596
597 def sanitize_open(filename, open_mode):
598 """Try to open the given filename, and slightly tweak it if this fails.
599
600 Attempts to open the given filename. If this fails, it tries to change
601 the filename slightly, step by step, until it's either able to open it
602 or it fails and raises a final exception, like the standard open()
603 function.
604
605 It returns the tuple (stream, definitive_file_name).
606 """
607 if filename == '-':
608 if sys.platform == 'win32':
609 import msvcrt
610 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
611 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
612
613 for attempt in range(2):
614 try:
615 try:
616 if sys.platform == 'win32':
617 # FIXME: An exclusive lock also locks the file from being read.
618 # Since windows locks are mandatory, don't lock the file on windows (for now).
619 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
620 raise LockingUnsupportedError()
621 stream = locked_file(filename, open_mode, block=False).__enter__()
622 except LockingUnsupportedError:
623 stream = open(filename, open_mode)
624 return (stream, filename)
625 except OSError as err:
626 if attempt or err.errno in (errno.EACCES,):
627 raise
628 old_filename, filename = filename, sanitize_path(filename)
629 if old_filename == filename:
630 raise
631
632
633 def timeconvert(timestr):
634 """Convert RFC 2822 defined time string into system timestamp"""
635 timestamp = None
636 timetuple = email.utils.parsedate_tz(timestr)
637 if timetuple is not None:
638 timestamp = email.utils.mktime_tz(timetuple)
639 return timestamp
640
641
642 def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
643 """Sanitizes a string so it could be used as part of a filename.
644 @param restricted Use a stricter subset of allowed characters
645 @param is_id Whether this is an ID that should be kept unchanged if possible.
646 If unset, yt-dlp's new sanitization rules are in effect
647 """
648 if s == '':
649 return ''
650
651 def replace_insane(char):
652 if restricted and char in ACCENT_CHARS:
653 return ACCENT_CHARS[char]
654 elif not restricted and char == '\n':
655 return '\0 '
656 elif char == '?' or ord(char) < 32 or ord(char) == 127:
657 return ''
658 elif char == '"':
659 return '' if restricted else '\''
660 elif char == ':':
661 return '\0_\0-' if restricted else '\0 \0-'
662 elif char in '\\/|*<>':
663 return '\0_'
664 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
665 return '\0_'
666 return char
667
668 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
669 result = ''.join(map(replace_insane, s))
670 if is_id is NO_DEFAULT:
671 result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result) # Remove repeated substitute chars
672 STRIP_RE = '(?:\0.|[ _-])*'
673 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
674 result = result.replace('\0', '') or '_'
675
676 if not is_id:
677 while '__' in result:
678 result = result.replace('__', '_')
679 result = result.strip('_')
680 # Common case of "Foreign band name - English song title"
681 if restricted and result.startswith('-_'):
682 result = result[2:]
683 if result.startswith('-'):
684 result = '_' + result[len('-'):]
685 result = result.lstrip('.')
686 if not result:
687 result = '_'
688 return result
689
690
691 def sanitize_path(s, force=False):
692 """Sanitizes and normalizes path on Windows"""
693 if sys.platform == 'win32':
694 force = False
695 drive_or_unc, _ = os.path.splitdrive(s)
696 elif force:
697 drive_or_unc = ''
698 else:
699 return s
700
701 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
702 if drive_or_unc:
703 norm_path.pop(0)
704 sanitized_path = [
705 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
706 for path_part in norm_path]
707 if drive_or_unc:
708 sanitized_path.insert(0, drive_or_unc + os.path.sep)
709 elif force and s and s[0] == os.path.sep:
710 sanitized_path.insert(0, os.path.sep)
711 return os.path.join(*sanitized_path)
712
713
714 def sanitize_url(url):
715 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
716 # the number of unwanted failures due to missing protocol
717 if url is None:
718 return
719 elif url.startswith('//'):
720 return 'http:%s' % url
721 # Fix some common typos seen so far
722 COMMON_TYPOS = (
723 # https://github.com/ytdl-org/youtube-dl/issues/15649
724 (r'^httpss://', r'https://'),
725 # https://bx1.be/lives/direct-tv/
726 (r'^rmtp([es]?)://', r'rtmp\1://'),
727 )
728 for mistake, fixup in COMMON_TYPOS:
729 if re.match(mistake, url):
730 return re.sub(mistake, fixup, url)
731 return url
732
733
734 def extract_basic_auth(url):
735 parts = compat_urlparse.urlsplit(url)
736 if parts.username is None:
737 return url, None
738 url = compat_urlparse.urlunsplit(parts._replace(netloc=(
739 parts.hostname if parts.port is None
740 else '%s:%d' % (parts.hostname, parts.port))))
741 auth_payload = base64.b64encode(
742 ('%s:%s' % (parts.username, parts.password or '')).encode())
743 return url, f'Basic {auth_payload.decode()}'
744
745
746 def sanitized_Request(url, *args, **kwargs):
747 url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
748 if auth_header is not None:
749 headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
750 headers['Authorization'] = auth_header
751 return compat_urllib_request.Request(url, *args, **kwargs)
752
753
754 def expand_path(s):
755 """Expand shell variables and ~"""
756 return os.path.expandvars(compat_expanduser(s))
757
758
759 def orderedSet(iterable):
760 """ Remove all duplicates from the input iterable """
761 res = []
762 for el in iterable:
763 if el not in res:
764 res.append(el)
765 return res
766
767
768 def _htmlentity_transform(entity_with_semicolon):
769 """Transforms an HTML entity to a character."""
770 entity = entity_with_semicolon[:-1]
771
772 # Known non-numeric HTML entity
773 if entity in compat_html_entities.name2codepoint:
774 return compat_chr(compat_html_entities.name2codepoint[entity])
775
776 # TODO: HTML5 allows entities without a semicolon. For example,
777 # '&Eacuteric' should be decoded as 'Éric'.
778 if entity_with_semicolon in compat_html_entities_html5:
779 return compat_html_entities_html5[entity_with_semicolon]
780
781 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
782 if mobj is not None:
783 numstr = mobj.group(1)
784 if numstr.startswith('x'):
785 base = 16
786 numstr = '0%s' % numstr
787 else:
788 base = 10
789 # See https://github.com/ytdl-org/youtube-dl/issues/7518
790 with contextlib.suppress(ValueError):
791 return compat_chr(int(numstr, base))
792
793 # Unknown entity in name, return its literal representation
794 return '&%s;' % entity
795
796
797 def unescapeHTML(s):
798 if s is None:
799 return None
800 assert isinstance(s, str)
801
802 return re.sub(
803 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
804
805
806 def escapeHTML(text):
807 return (
808 text
809 .replace('&', '&amp;')
810 .replace('<', '&lt;')
811 .replace('>', '&gt;')
812 .replace('"', '&quot;')
813 .replace("'", '&#39;')
814 )
815
816
817 def process_communicate_or_kill(p, *args, **kwargs):
818 try:
819 return p.communicate(*args, **kwargs)
820 except BaseException: # Including KeyboardInterrupt
821 p.kill()
822 p.wait()
823 raise
824
825
826 class Popen(subprocess.Popen):
827 if sys.platform == 'win32':
828 _startupinfo = subprocess.STARTUPINFO()
829 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
830 else:
831 _startupinfo = None
832
833 def __init__(self, *args, **kwargs):
834 super().__init__(*args, **kwargs, startupinfo=self._startupinfo)
835
836 def communicate_or_kill(self, *args, **kwargs):
837 return process_communicate_or_kill(self, *args, **kwargs)
838
839
840 def get_subprocess_encoding():
841 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
842 # For subprocess calls, encode with locale encoding
843 # Refer to http://stackoverflow.com/a/9951851/35070
844 encoding = preferredencoding()
845 else:
846 encoding = sys.getfilesystemencoding()
847 if encoding is None:
848 encoding = 'utf-8'
849 return encoding
850
851
852 def encodeFilename(s, for_subprocess=False):
853 assert isinstance(s, str)
854 return s
855
856
857 def decodeFilename(b, for_subprocess=False):
858 return b
859
860
861 def encodeArgument(s):
862 # Legacy code that uses byte strings
863 # Uncomment the following line after fixing all post processors
864 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
865 return s if isinstance(s, str) else s.decode('ascii')
866
867
868 def decodeArgument(b):
869 return b
870
871
872 def decodeOption(optval):
873 if optval is None:
874 return optval
875 if isinstance(optval, bytes):
876 optval = optval.decode(preferredencoding())
877
878 assert isinstance(optval, compat_str)
879 return optval
880
881
882 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
883
884
885 def timetuple_from_msec(msec):
886 secs, msec = divmod(msec, 1000)
887 mins, secs = divmod(secs, 60)
888 hrs, mins = divmod(mins, 60)
889 return _timetuple(hrs, mins, secs, msec)
890
891
892 def formatSeconds(secs, delim=':', msec=False):
893 time = timetuple_from_msec(secs * 1000)
894 if time.hours:
895 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
896 elif time.minutes:
897 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
898 else:
899 ret = '%d' % time.seconds
900 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
901
902
903 def _ssl_load_windows_store_certs(ssl_context, storename):
904 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
905 try:
906 certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
907 if encoding == 'x509_asn' and (
908 trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
909 except PermissionError:
910 return
911 for cert in certs:
912 with contextlib.suppress(ssl.SSLError):
913 ssl_context.load_verify_locations(cadata=cert)
914
915
916 def make_HTTPS_handler(params, **kwargs):
917 opts_check_certificate = not params.get('nocheckcertificate')
918 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
919 context.check_hostname = opts_check_certificate
920 if params.get('legacyserverconnect'):
921 context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
922 # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
923 context.set_ciphers('DEFAULT')
924 context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
925 if opts_check_certificate:
926 if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
927 context.load_verify_locations(cafile=certifi.where())
928 else:
929 try:
930 context.load_default_certs()
931 # Work around the issue in load_default_certs when there are bad certificates. See:
932 # https://github.com/yt-dlp/yt-dlp/issues/1060,
933 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
934 except ssl.SSLError:
935 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
936 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
937 for storename in ('CA', 'ROOT'):
938 _ssl_load_windows_store_certs(context, storename)
939 context.set_default_verify_paths()
940 client_certfile = params.get('client_certificate')
941 if client_certfile:
942 try:
943 context.load_cert_chain(
944 client_certfile, keyfile=params.get('client_certificate_key'),
945 password=params.get('client_certificate_password'))
946 except ssl.SSLError:
947 raise YoutubeDLError('Unable to load client certificate')
948 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
949
950
951 def bug_reports_message(before=';'):
952 msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
953 'filling out the appropriate issue template. '
954 'Confirm you are on the latest version using yt-dlp -U')
955
956 before = before.rstrip()
957 if not before or before.endswith(('.', '!', '?')):
958 msg = msg[0].title() + msg[1:]
959
960 return (before + ' ' if before else '') + msg
961
962
963 class YoutubeDLError(Exception):
964 """Base exception for YoutubeDL errors."""
965 msg = None
966
967 def __init__(self, msg=None):
968 if msg is not None:
969 self.msg = msg
970 elif self.msg is None:
971 self.msg = type(self).__name__
972 super().__init__(self.msg)
973
974
975 network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
976 if hasattr(ssl, 'CertificateError'):
977 network_exceptions.append(ssl.CertificateError)
978 network_exceptions = tuple(network_exceptions)
979
980
981 class ExtractorError(YoutubeDLError):
982 """Error during info extraction."""
983
984 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
985 """ tb, if given, is the original traceback (so that it can be printed out).
986 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
987 """
988 if sys.exc_info()[0] in network_exceptions:
989 expected = True
990
991 self.orig_msg = str(msg)
992 self.traceback = tb
993 self.expected = expected
994 self.cause = cause
995 self.video_id = video_id
996 self.ie = ie
997 self.exc_info = sys.exc_info() # preserve original exception
998
999 super().__init__(''.join((
1000 format_field(ie, template='[%s] '),
1001 format_field(video_id, template='%s: '),
1002 msg,
1003 format_field(cause, template=' (caused by %r)'),
1004 '' if expected else bug_reports_message())))
1005
1006 def format_traceback(self):
1007 return join_nonempty(
1008 self.traceback and ''.join(traceback.format_tb(self.traceback)),
1009 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
1010 delim='\n') or None
1011
1012
1013 class UnsupportedError(ExtractorError):
1014 def __init__(self, url):
1015 super().__init__(
1016 'Unsupported URL: %s' % url, expected=True)
1017 self.url = url
1018
1019
1020 class RegexNotFoundError(ExtractorError):
1021 """Error when a regex didn't match"""
1022 pass
1023
1024
1025 class GeoRestrictedError(ExtractorError):
1026 """Geographic restriction Error exception.
1027
1028 This exception may be thrown when a video is not available from your
1029 geographic location due to geographic restrictions imposed by a website.
1030 """
1031
1032 def __init__(self, msg, countries=None, **kwargs):
1033 kwargs['expected'] = True
1034 super().__init__(msg, **kwargs)
1035 self.countries = countries
1036
1037
1038 class DownloadError(YoutubeDLError):
1039 """Download Error exception.
1040
1041 This exception may be thrown by FileDownloader objects if they are not
1042 configured to continue on errors. They will contain the appropriate
1043 error message.
1044 """
1045
1046 def __init__(self, msg, exc_info=None):
1047 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1048 super().__init__(msg)
1049 self.exc_info = exc_info
1050
1051
1052 class EntryNotInPlaylist(YoutubeDLError):
1053 """Entry not in playlist exception.
1054
1055 This exception will be thrown by YoutubeDL when a requested entry
1056 is not found in the playlist info_dict
1057 """
1058 msg = 'Entry not found in info'
1059
1060
1061 class SameFileError(YoutubeDLError):
1062 """Same File exception.
1063
1064 This exception will be thrown by FileDownloader objects if they detect
1065 multiple files would have to be downloaded to the same file on disk.
1066 """
1067 msg = 'Fixed output name but more than one file to download'
1068
1069 def __init__(self, filename=None):
1070 if filename is not None:
1071 self.msg += f': {filename}'
1072 super().__init__(self.msg)
1073
1074
1075 class PostProcessingError(YoutubeDLError):
1076 """Post Processing exception.
1077
1078 This exception may be raised by PostProcessor's .run() method to
1079 indicate an error in the postprocessing task.
1080 """
1081
1082
1083 class DownloadCancelled(YoutubeDLError):
1084 """ Exception raised when the download queue should be interrupted """
1085 msg = 'The download was cancelled'
1086
1087
1088 class ExistingVideoReached(DownloadCancelled):
1089 """ --break-on-existing triggered """
1090 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1091
1092
1093 class RejectedVideoReached(DownloadCancelled):
1094 """ --break-on-reject triggered """
1095 msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1096
1097
1098 class MaxDownloadsReached(DownloadCancelled):
1099 """ --max-downloads limit has been reached. """
1100 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1101
1102
1103 class ReExtractInfo(YoutubeDLError):
1104 """ Video info needs to be re-extracted. """
1105
1106 def __init__(self, msg, expected=False):
1107 super().__init__(msg)
1108 self.expected = expected
1109
1110
1111 class ThrottledDownload(ReExtractInfo):
1112 """ Download speed below --throttled-rate. """
1113 msg = 'The download speed is below throttle limit'
1114
1115 def __init__(self):
1116 super().__init__(self.msg, expected=False)
1117
1118
1119 class UnavailableVideoError(YoutubeDLError):
1120 """Unavailable Format exception.
1121
1122 This exception will be thrown when a video is requested
1123 in a format that is not available for that video.
1124 """
1125 msg = 'Unable to download video'
1126
1127 def __init__(self, err=None):
1128 if err is not None:
1129 self.msg += f': {err}'
1130 super().__init__(self.msg)
1131
1132
1133 class ContentTooShortError(YoutubeDLError):
1134 """Content Too Short exception.
1135
1136 This exception may be raised by FileDownloader objects when a file they
1137 download is too small for what the server announced first, indicating
1138 the connection was probably interrupted.
1139 """
1140
1141 def __init__(self, downloaded, expected):
1142 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1143 # Both in bytes
1144 self.downloaded = downloaded
1145 self.expected = expected
1146
1147
1148 class XAttrMetadataError(YoutubeDLError):
1149 def __init__(self, code=None, msg='Unknown error'):
1150 super().__init__(msg)
1151 self.code = code
1152 self.msg = msg
1153
1154 # Parsing code and msg
1155 if (self.code in (errno.ENOSPC, errno.EDQUOT)
1156 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1157 self.reason = 'NO_SPACE'
1158 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1159 self.reason = 'VALUE_TOO_LONG'
1160 else:
1161 self.reason = 'NOT_SUPPORTED'
1162
1163
1164 class XAttrUnavailableError(YoutubeDLError):
1165 pass
1166
1167
1168 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1169 hc = http_class(*args, **kwargs)
1170 source_address = ydl_handler._params.get('source_address')
1171
1172 if source_address is not None:
1173 # This is to workaround _create_connection() from socket where it will try all
1174 # address data from getaddrinfo() including IPv6. This filters the result from
1175 # getaddrinfo() based on the source_address value.
1176 # This is based on the cpython socket.create_connection() function.
1177 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1178 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1179 host, port = address
1180 err = None
1181 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1182 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1183 ip_addrs = [addr for addr in addrs if addr[0] == af]
1184 if addrs and not ip_addrs:
1185 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1186 raise OSError(
1187 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1188 % (ip_version, source_address[0]))
1189 for res in ip_addrs:
1190 af, socktype, proto, canonname, sa = res
1191 sock = None
1192 try:
1193 sock = socket.socket(af, socktype, proto)
1194 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1195 sock.settimeout(timeout)
1196 sock.bind(source_address)
1197 sock.connect(sa)
1198 err = None # Explicitly break reference cycle
1199 return sock
1200 except OSError as _:
1201 err = _
1202 if sock is not None:
1203 sock.close()
1204 if err is not None:
1205 raise err
1206 else:
1207 raise OSError('getaddrinfo returns an empty list')
1208 if hasattr(hc, '_create_connection'):
1209 hc._create_connection = _create_connection
1210 hc.source_address = (source_address, 0)
1211
1212 return hc
1213
1214
1215 def handle_youtubedl_headers(headers):
1216 filtered_headers = headers
1217
1218 if 'Youtubedl-no-compression' in filtered_headers:
1219 filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
1220 del filtered_headers['Youtubedl-no-compression']
1221
1222 return filtered_headers
1223
1224
1225 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
1226 """Handler for HTTP requests and responses.
1227
1228 This class, when installed with an OpenerDirector, automatically adds
1229 the standard headers to every HTTP request and handles gzipped and
1230 deflated responses from web servers. If compression is to be avoided in
1231 a particular request, the original request in the program code only has
1232 to include the HTTP header "Youtubedl-no-compression", which will be
1233 removed before making the real request.
1234
1235 Part of this code was copied from:
1236
1237 http://techknack.net/python-urllib2-handlers/
1238
1239 Andrew Rowls, the author of that code, agreed to release it to the
1240 public domain.
1241 """
1242
1243 def __init__(self, params, *args, **kwargs):
1244 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
1245 self._params = params
1246
1247 def http_open(self, req):
1248 conn_class = compat_http_client.HTTPConnection
1249
1250 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1251 if socks_proxy:
1252 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1253 del req.headers['Ytdl-socks-proxy']
1254
1255 return self.do_open(functools.partial(
1256 _create_http_connection, self, conn_class, False),
1257 req)
1258
1259 @staticmethod
1260 def deflate(data):
1261 if not data:
1262 return data
1263 try:
1264 return zlib.decompress(data, -zlib.MAX_WBITS)
1265 except zlib.error:
1266 return zlib.decompress(data)
1267
1268 @staticmethod
1269 def brotli(data):
1270 if not data:
1271 return data
1272 return brotli.decompress(data)
1273
1274 def http_request(self, req):
1275 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1276 # always respected by websites, some tend to give out URLs with non percent-encoded
1277 # non-ASCII characters (see telemb.py, ard.py [#3412])
1278 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1279 # To work around aforementioned issue we will replace request's original URL with
1280 # percent-encoded one
1281 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1282 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1283 url = req.get_full_url()
1284 url_escaped = escape_url(url)
1285
1286 # Substitute URL if any change after escaping
1287 if url != url_escaped:
1288 req = update_Request(req, url=url_escaped)
1289
1290 for h, v in self._params.get('http_headers', std_headers).items():
1291 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1292 # The dict keys are capitalized because of this bug by urllib
1293 if h.capitalize() not in req.headers:
1294 req.add_header(h, v)
1295
1296 if 'Accept-encoding' not in req.headers:
1297 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1298
1299 req.headers = handle_youtubedl_headers(req.headers)
1300
1301 return req
1302
1303 def http_response(self, req, resp):
1304 old_resp = resp
1305 # gzip
1306 if resp.headers.get('Content-encoding', '') == 'gzip':
1307 content = resp.read()
1308 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1309 try:
1310 uncompressed = io.BytesIO(gz.read())
1311 except OSError as original_ioerror:
1312 # There may be junk add the end of the file
1313 # See http://stackoverflow.com/q/4928560/35070 for details
1314 for i in range(1, 1024):
1315 try:
1316 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1317 uncompressed = io.BytesIO(gz.read())
1318 except OSError:
1319 continue
1320 break
1321 else:
1322 raise original_ioerror
1323 resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1324 resp.msg = old_resp.msg
1325 del resp.headers['Content-encoding']
1326 # deflate
1327 if resp.headers.get('Content-encoding', '') == 'deflate':
1328 gz = io.BytesIO(self.deflate(resp.read()))
1329 resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1330 resp.msg = old_resp.msg
1331 del resp.headers['Content-encoding']
1332 # brotli
1333 if resp.headers.get('Content-encoding', '') == 'br':
1334 resp = compat_urllib_request.addinfourl(
1335 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1336 resp.msg = old_resp.msg
1337 del resp.headers['Content-encoding']
1338 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1339 # https://github.com/ytdl-org/youtube-dl/issues/6457).
1340 if 300 <= resp.code < 400:
1341 location = resp.headers.get('Location')
1342 if location:
1343 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1344 location = location.encode('iso-8859-1').decode()
1345 location_escaped = escape_url(location)
1346 if location != location_escaped:
1347 del resp.headers['Location']
1348 resp.headers['Location'] = location_escaped
1349 return resp
1350
1351 https_request = http_request
1352 https_response = http_response
1353
1354
1355 def make_socks_conn_class(base_class, socks_proxy):
1356 assert issubclass(base_class, (
1357 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1358
1359 url_components = compat_urlparse.urlparse(socks_proxy)
1360 if url_components.scheme.lower() == 'socks5':
1361 socks_type = ProxyType.SOCKS5
1362 elif url_components.scheme.lower() in ('socks', 'socks4'):
1363 socks_type = ProxyType.SOCKS4
1364 elif url_components.scheme.lower() == 'socks4a':
1365 socks_type = ProxyType.SOCKS4A
1366
1367 def unquote_if_non_empty(s):
1368 if not s:
1369 return s
1370 return compat_urllib_parse_unquote_plus(s)
1371
1372 proxy_args = (
1373 socks_type,
1374 url_components.hostname, url_components.port or 1080,
1375 True, # Remote DNS
1376 unquote_if_non_empty(url_components.username),
1377 unquote_if_non_empty(url_components.password),
1378 )
1379
1380 class SocksConnection(base_class):
1381 def connect(self):
1382 self.sock = sockssocket()
1383 self.sock.setproxy(*proxy_args)
1384 if isinstance(self.timeout, (int, float)):
1385 self.sock.settimeout(self.timeout)
1386 self.sock.connect((self.host, self.port))
1387
1388 if isinstance(self, compat_http_client.HTTPSConnection):
1389 if hasattr(self, '_context'): # Python > 2.6
1390 self.sock = self._context.wrap_socket(
1391 self.sock, server_hostname=self.host)
1392 else:
1393 self.sock = ssl.wrap_socket(self.sock)
1394
1395 return SocksConnection
1396
1397
1398 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1399 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1400 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1401 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1402 self._params = params
1403
1404 def https_open(self, req):
1405 kwargs = {}
1406 conn_class = self._https_conn_class
1407
1408 if hasattr(self, '_context'): # python > 2.6
1409 kwargs['context'] = self._context
1410 if hasattr(self, '_check_hostname'): # python 3.x
1411 kwargs['check_hostname'] = self._check_hostname
1412
1413 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1414 if socks_proxy:
1415 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1416 del req.headers['Ytdl-socks-proxy']
1417
1418 try:
1419 return self.do_open(
1420 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1421 except urllib.error.URLError as e:
1422 if (isinstance(e.reason, ssl.SSLError)
1423 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1424 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1425 raise
1426
1427
1428 class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
1429 """
1430 See [1] for cookie file format.
1431
1432 1. https://curl.haxx.se/docs/http-cookies.html
1433 """
1434 _HTTPONLY_PREFIX = '#HttpOnly_'
1435 _ENTRY_LEN = 7
1436 _HEADER = '''# Netscape HTTP Cookie File
1437 # This file is generated by yt-dlp. Do not edit.
1438
1439 '''
1440 _CookieFileEntry = collections.namedtuple(
1441 'CookieFileEntry',
1442 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1443
1444 def __init__(self, filename=None, *args, **kwargs):
1445 super().__init__(None, *args, **kwargs)
1446 if self.is_path(filename):
1447 filename = os.fspath(filename)
1448 self.filename = filename
1449
1450 @staticmethod
1451 def _true_or_false(cndn):
1452 return 'TRUE' if cndn else 'FALSE'
1453
1454 @staticmethod
1455 def is_path(file):
1456 return isinstance(file, (str, bytes, os.PathLike))
1457
1458 @contextlib.contextmanager
1459 def open(self, file, *, write=False):
1460 if self.is_path(file):
1461 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1462 yield f
1463 else:
1464 if write:
1465 file.truncate(0)
1466 yield file
1467
1468 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1469 now = time.time()
1470 for cookie in self:
1471 if (not ignore_discard and cookie.discard
1472 or not ignore_expires and cookie.is_expired(now)):
1473 continue
1474 name, value = cookie.name, cookie.value
1475 if value is None:
1476 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1477 # with no name, whereas http.cookiejar regards it as a
1478 # cookie with no value.
1479 name, value = '', name
1480 f.write('%s\n' % '\t'.join((
1481 cookie.domain,
1482 self._true_or_false(cookie.domain.startswith('.')),
1483 cookie.path,
1484 self._true_or_false(cookie.secure),
1485 str_or_none(cookie.expires, default=''),
1486 name, value
1487 )))
1488
1489 def save(self, filename=None, *args, **kwargs):
1490 """
1491 Save cookies to a file.
1492 Code is taken from CPython 3.6
1493 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
1494
1495 if filename is None:
1496 if self.filename is not None:
1497 filename = self.filename
1498 else:
1499 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1500
1501 # Store session cookies with `expires` set to 0 instead of an empty string
1502 for cookie in self:
1503 if cookie.expires is None:
1504 cookie.expires = 0
1505
1506 with self.open(filename, write=True) as f:
1507 f.write(self._HEADER)
1508 self._really_save(f, *args, **kwargs)
1509
1510 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1511 """Load cookies from a file."""
1512 if filename is None:
1513 if self.filename is not None:
1514 filename = self.filename
1515 else:
1516 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1517
1518 def prepare_line(line):
1519 if line.startswith(self._HTTPONLY_PREFIX):
1520 line = line[len(self._HTTPONLY_PREFIX):]
1521 # comments and empty lines are fine
1522 if line.startswith('#') or not line.strip():
1523 return line
1524 cookie_list = line.split('\t')
1525 if len(cookie_list) != self._ENTRY_LEN:
1526 raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
1527 cookie = self._CookieFileEntry(*cookie_list)
1528 if cookie.expires_at and not cookie.expires_at.isdigit():
1529 raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1530 return line
1531
1532 cf = io.StringIO()
1533 with self.open(filename) as f:
1534 for line in f:
1535 try:
1536 cf.write(prepare_line(line))
1537 except compat_cookiejar.LoadError as e:
1538 if f'{line.strip()} '[0] in '[{"':
1539 raise compat_cookiejar.LoadError(
1540 'Cookies file must be Netscape formatted, not JSON. See '
1541 'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
1542 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
1543 continue
1544 cf.seek(0)
1545 self._really_load(cf, filename, ignore_discard, ignore_expires)
1546 # Session cookies are denoted by either `expires` field set to
1547 # an empty string or 0. MozillaCookieJar only recognizes the former
1548 # (see [1]). So we need force the latter to be recognized as session
1549 # cookies on our own.
1550 # Session cookies may be important for cookies-based authentication,
1551 # e.g. usually, when user does not check 'Remember me' check box while
1552 # logging in on a site, some important cookies are stored as session
1553 # cookies so that not recognizing them will result in failed login.
1554 # 1. https://bugs.python.org/issue17164
1555 for cookie in self:
1556 # Treat `expires=0` cookies as session cookies
1557 if cookie.expires == 0:
1558 cookie.expires = None
1559 cookie.discard = True
1560
1561
1562 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1563 def __init__(self, cookiejar=None):
1564 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1565
1566 def http_response(self, request, response):
1567 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1568
1569 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1570 https_response = http_response
1571
1572
1573 class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1574 """YoutubeDL redirect handler
1575
1576 The code is based on HTTPRedirectHandler implementation from CPython [1].
1577
1578 This redirect handler solves two issues:
1579 - ensures redirect URL is always unicode under python 2
1580 - introduces support for experimental HTTP response status code
1581 308 Permanent Redirect [2] used by some sites [3]
1582
1583 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1584 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1585 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1586 """
1587
1588 http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
1589
1590 def redirect_request(self, req, fp, code, msg, headers, newurl):
1591 """Return a Request or None in response to a redirect.
1592
1593 This is called by the http_error_30x methods when a
1594 redirection response is received. If a redirection should
1595 take place, return a new Request to allow http_error_30x to
1596 perform the redirect. Otherwise, raise HTTPError if no-one
1597 else should try to handle this url. Return None if you can't
1598 but another Handler might.
1599 """
1600 m = req.get_method()
1601 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1602 or code in (301, 302, 303) and m == "POST")):
1603 raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1604 # Strictly (according to RFC 2616), 301 or 302 in response to
1605 # a POST MUST NOT cause a redirection without confirmation
1606 # from the user (of urllib.request, in this case). In practice,
1607 # essentially all clients do redirect in this case, so we do
1608 # the same.
1609
1610 # Be conciliant with URIs containing a space. This is mainly
1611 # redundant with the more complete encoding done in http_error_302(),
1612 # but it is kept for compatibility with other callers.
1613 newurl = newurl.replace(' ', '%20')
1614
1615 CONTENT_HEADERS = ("content-length", "content-type")
1616 # NB: don't use dict comprehension for python 2.6 compatibility
1617 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1618
1619 # A 303 must either use GET or HEAD for subsequent request
1620 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1621 if code == 303 and m != 'HEAD':
1622 m = 'GET'
1623 # 301 and 302 redirects are commonly turned into a GET from a POST
1624 # for subsequent requests by browsers, so we'll do the same.
1625 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1626 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1627 if code in (301, 302) and m == 'POST':
1628 m = 'GET'
1629
1630 return compat_urllib_request.Request(
1631 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1632 unverifiable=True, method=m)
1633
1634
1635 def extract_timezone(date_str):
1636 m = re.search(
1637 r'''(?x)
1638 ^.{8,}? # >=8 char non-TZ prefix, if present
1639 (?P<tz>Z| # just the UTC Z, or
1640 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1641 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1642 [ ]? # optional space
1643 (?P<sign>\+|-) # +/-
1644 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1645 $)
1646 ''', date_str)
1647 if not m:
1648 timezone = datetime.timedelta()
1649 else:
1650 date_str = date_str[:-len(m.group('tz'))]
1651 if not m.group('sign'):
1652 timezone = datetime.timedelta()
1653 else:
1654 sign = 1 if m.group('sign') == '+' else -1
1655 timezone = datetime.timedelta(
1656 hours=sign * int(m.group('hours')),
1657 minutes=sign * int(m.group('minutes')))
1658 return timezone, date_str
1659
1660
1661 def parse_iso8601(date_str, delimiter='T', timezone=None):
1662 """ Return a UNIX timestamp from the given date """
1663
1664 if date_str is None:
1665 return None
1666
1667 date_str = re.sub(r'\.[0-9]+', '', date_str)
1668
1669 if timezone is None:
1670 timezone, date_str = extract_timezone(date_str)
1671
1672 with contextlib.suppress(ValueError):
1673 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1674 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1675 return calendar.timegm(dt.timetuple())
1676
1677
1678 def date_formats(day_first=True):
1679 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1680
1681
1682 def unified_strdate(date_str, day_first=True):
1683 """Return a string with the date in the format YYYYMMDD"""
1684
1685 if date_str is None:
1686 return None
1687 upload_date = None
1688 # Replace commas
1689 date_str = date_str.replace(',', ' ')
1690 # Remove AM/PM + timezone
1691 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1692 _, date_str = extract_timezone(date_str)
1693
1694 for expression in date_formats(day_first):
1695 with contextlib.suppress(ValueError):
1696 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1697 if upload_date is None:
1698 timetuple = email.utils.parsedate_tz(date_str)
1699 if timetuple:
1700 with contextlib.suppress(ValueError):
1701 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1702 if upload_date is not None:
1703 return compat_str(upload_date)
1704
1705
1706 def unified_timestamp(date_str, day_first=True):
1707 if date_str is None:
1708 return None
1709
1710 date_str = re.sub(r'[,|]', '', date_str)
1711
1712 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1713 timezone, date_str = extract_timezone(date_str)
1714
1715 # Remove AM/PM + timezone
1716 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1717
1718 # Remove unrecognized timezones from ISO 8601 alike timestamps
1719 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1720 if m:
1721 date_str = date_str[:-len(m.group('tz'))]
1722
1723 # Python only supports microseconds, so remove nanoseconds
1724 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1725 if m:
1726 date_str = m.group(1)
1727
1728 for expression in date_formats(day_first):
1729 with contextlib.suppress(ValueError):
1730 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1731 return calendar.timegm(dt.timetuple())
1732 timetuple = email.utils.parsedate_tz(date_str)
1733 if timetuple:
1734 return calendar.timegm(timetuple) + pm_delta * 3600
1735
1736
1737 def determine_ext(url, default_ext='unknown_video'):
1738 if url is None or '.' not in url:
1739 return default_ext
1740 guess = url.partition('?')[0].rpartition('.')[2]
1741 if re.match(r'^[A-Za-z0-9]+$', guess):
1742 return guess
1743 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1744 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1745 return guess.rstrip('/')
1746 else:
1747 return default_ext
1748
1749
1750 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1751 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1752
1753
1754 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1755 R"""
1756 Return a datetime object from a string.
1757 Supported format:
1758 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1759
1760 @param format strftime format of DATE
1761 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1762 auto: round to the unit provided in date_str (if applicable).
1763 """
1764 auto_precision = False
1765 if precision == 'auto':
1766 auto_precision = True
1767 precision = 'microsecond'
1768 today = datetime_round(datetime.datetime.utcnow(), precision)
1769 if date_str in ('now', 'today'):
1770 return today
1771 if date_str == 'yesterday':
1772 return today - datetime.timedelta(days=1)
1773 match = re.match(
1774 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1775 date_str)
1776 if match is not None:
1777 start_time = datetime_from_str(match.group('start'), precision, format)
1778 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1779 unit = match.group('unit')
1780 if unit == 'month' or unit == 'year':
1781 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1782 unit = 'day'
1783 else:
1784 if unit == 'week':
1785 unit = 'day'
1786 time *= 7
1787 delta = datetime.timedelta(**{unit + 's': time})
1788 new_date = start_time + delta
1789 if auto_precision:
1790 return datetime_round(new_date, unit)
1791 return new_date
1792
1793 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1794
1795
1796 def date_from_str(date_str, format='%Y%m%d', strict=False):
1797 R"""
1798 Return a date object from a string using datetime_from_str
1799
1800 @param strict Restrict allowed patterns to "YYYYMMDD" and
1801 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1802 """
1803 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1804 raise ValueError(f'Invalid date format "{date_str}"')
1805 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1806
1807
1808 def datetime_add_months(dt, months):
1809 """Increment/Decrement a datetime object by months."""
1810 month = dt.month + months - 1
1811 year = dt.year + month // 12
1812 month = month % 12 + 1
1813 day = min(dt.day, calendar.monthrange(year, month)[1])
1814 return dt.replace(year, month, day)
1815
1816
1817 def datetime_round(dt, precision='day'):
1818 """
1819 Round a datetime object's time to a specific precision
1820 """
1821 if precision == 'microsecond':
1822 return dt
1823
1824 unit_seconds = {
1825 'day': 86400,
1826 'hour': 3600,
1827 'minute': 60,
1828 'second': 1,
1829 }
1830 roundto = lambda x, n: ((x + n / 2) // n) * n
1831 timestamp = calendar.timegm(dt.timetuple())
1832 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1833
1834
1835 def hyphenate_date(date_str):
1836 """
1837 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1838 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1839 if match is not None:
1840 return '-'.join(match.groups())
1841 else:
1842 return date_str
1843
1844
1845 class DateRange:
1846 """Represents a time interval between two dates"""
1847
1848 def __init__(self, start=None, end=None):
1849 """start and end must be strings in the format accepted by date"""
1850 if start is not None:
1851 self.start = date_from_str(start, strict=True)
1852 else:
1853 self.start = datetime.datetime.min.date()
1854 if end is not None:
1855 self.end = date_from_str(end, strict=True)
1856 else:
1857 self.end = datetime.datetime.max.date()
1858 if self.start > self.end:
1859 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1860
1861 @classmethod
1862 def day(cls, day):
1863 """Returns a range that only contains the given day"""
1864 return cls(day, day)
1865
1866 def __contains__(self, date):
1867 """Check if the date is in the range"""
1868 if not isinstance(date, datetime.date):
1869 date = date_from_str(date)
1870 return self.start <= date <= self.end
1871
1872 def __str__(self):
1873 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1874
1875
1876 def platform_name():
1877 """ Returns the platform name as a compat_str """
1878 res = platform.platform()
1879 if isinstance(res, bytes):
1880 res = res.decode(preferredencoding())
1881
1882 assert isinstance(res, compat_str)
1883 return res
1884
1885
1886 @functools.cache
1887 def get_windows_version():
1888 ''' Get Windows version. None if it's not running on Windows '''
1889 if compat_os_name == 'nt':
1890 return version_tuple(platform.win32_ver()[1])
1891 else:
1892 return None
1893
1894
1895 def write_string(s, out=None, encoding=None):
1896 assert isinstance(s, str)
1897 out = out or sys.stderr
1898
1899 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1900 s = re.sub(r'([\r\n]+)', r' \1', s)
1901
1902 enc = None
1903 if 'b' in getattr(out, 'mode', ''):
1904 enc = encoding or preferredencoding()
1905 elif hasattr(out, 'buffer'):
1906 out = out.buffer
1907 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1908
1909 out.write(s.encode(enc, 'ignore') if enc else s)
1910 out.flush()
1911
1912
1913 def bytes_to_intlist(bs):
1914 if not bs:
1915 return []
1916 if isinstance(bs[0], int): # Python 3
1917 return list(bs)
1918 else:
1919 return [ord(c) for c in bs]
1920
1921
1922 def intlist_to_bytes(xs):
1923 if not xs:
1924 return b''
1925 return compat_struct_pack('%dB' % len(xs), *xs)
1926
1927
1928 class LockingUnsupportedError(IOError):
1929 msg = 'File locking is not supported on this platform'
1930
1931 def __init__(self):
1932 super().__init__(self.msg)
1933
1934
1935 # Cross-platform file locking
1936 if sys.platform == 'win32':
1937 import ctypes.wintypes
1938 import msvcrt
1939
1940 class OVERLAPPED(ctypes.Structure):
1941 _fields_ = [
1942 ('Internal', ctypes.wintypes.LPVOID),
1943 ('InternalHigh', ctypes.wintypes.LPVOID),
1944 ('Offset', ctypes.wintypes.DWORD),
1945 ('OffsetHigh', ctypes.wintypes.DWORD),
1946 ('hEvent', ctypes.wintypes.HANDLE),
1947 ]
1948
1949 kernel32 = ctypes.windll.kernel32
1950 LockFileEx = kernel32.LockFileEx
1951 LockFileEx.argtypes = [
1952 ctypes.wintypes.HANDLE, # hFile
1953 ctypes.wintypes.DWORD, # dwFlags
1954 ctypes.wintypes.DWORD, # dwReserved
1955 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1956 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1957 ctypes.POINTER(OVERLAPPED) # Overlapped
1958 ]
1959 LockFileEx.restype = ctypes.wintypes.BOOL
1960 UnlockFileEx = kernel32.UnlockFileEx
1961 UnlockFileEx.argtypes = [
1962 ctypes.wintypes.HANDLE, # hFile
1963 ctypes.wintypes.DWORD, # dwReserved
1964 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1965 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1966 ctypes.POINTER(OVERLAPPED) # Overlapped
1967 ]
1968 UnlockFileEx.restype = ctypes.wintypes.BOOL
1969 whole_low = 0xffffffff
1970 whole_high = 0x7fffffff
1971
1972 def _lock_file(f, exclusive, block):
1973 overlapped = OVERLAPPED()
1974 overlapped.Offset = 0
1975 overlapped.OffsetHigh = 0
1976 overlapped.hEvent = 0
1977 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1978
1979 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1980 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1981 0, whole_low, whole_high, f._lock_file_overlapped_p):
1982 raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
1983
1984 def _unlock_file(f):
1985 assert f._lock_file_overlapped_p
1986 handle = msvcrt.get_osfhandle(f.fileno())
1987 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
1988 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1989
1990 else:
1991 try:
1992 import fcntl
1993
1994 def _lock_file(f, exclusive, block):
1995 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1996 if not block:
1997 flags |= fcntl.LOCK_NB
1998 try:
1999 fcntl.flock(f, flags)
2000 except BlockingIOError:
2001 raise
2002 except OSError: # AOSP does not have flock()
2003 fcntl.lockf(f, flags)
2004
2005 def _unlock_file(f):
2006 try:
2007 fcntl.flock(f, fcntl.LOCK_UN)
2008 except OSError:
2009 fcntl.lockf(f, fcntl.LOCK_UN)
2010
2011 except ImportError:
2012
2013 def _lock_file(f, exclusive, block):
2014 raise LockingUnsupportedError()
2015
2016 def _unlock_file(f):
2017 raise LockingUnsupportedError()
2018
2019
2020 class locked_file:
2021 locked = False
2022
2023 def __init__(self, filename, mode, block=True, encoding=None):
2024 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2025 raise NotImplementedError(mode)
2026 self.mode, self.block = mode, block
2027
2028 writable = any(f in mode for f in 'wax+')
2029 readable = any(f in mode for f in 'r+')
2030 flags = functools.reduce(operator.ior, (
2031 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2032 getattr(os, 'O_BINARY', 0), # Windows only
2033 getattr(os, 'O_NOINHERIT', 0), # Windows only
2034 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2035 os.O_APPEND if 'a' in mode else 0,
2036 os.O_EXCL if 'x' in mode else 0,
2037 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2038 ))
2039
2040 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2041
2042 def __enter__(self):
2043 exclusive = 'r' not in self.mode
2044 try:
2045 _lock_file(self.f, exclusive, self.block)
2046 self.locked = True
2047 except OSError:
2048 self.f.close()
2049 raise
2050 if 'w' in self.mode:
2051 try:
2052 self.f.truncate()
2053 except OSError as e:
2054 if e.errno != 29: # Illegal seek, expected when self.f is a FIFO
2055 raise e
2056 return self
2057
2058 def unlock(self):
2059 if not self.locked:
2060 return
2061 try:
2062 _unlock_file(self.f)
2063 finally:
2064 self.locked = False
2065
2066 def __exit__(self, *_):
2067 try:
2068 self.unlock()
2069 finally:
2070 self.f.close()
2071
2072 open = __enter__
2073 close = __exit__
2074
2075 def __getattr__(self, attr):
2076 return getattr(self.f, attr)
2077
2078 def __iter__(self):
2079 return iter(self.f)
2080
2081
2082 @functools.cache
2083 def get_filesystem_encoding():
2084 encoding = sys.getfilesystemencoding()
2085 return encoding if encoding is not None else 'utf-8'
2086
2087
2088 def shell_quote(args):
2089 quoted_args = []
2090 encoding = get_filesystem_encoding()
2091 for a in args:
2092 if isinstance(a, bytes):
2093 # We may get a filename encoded with 'encodeFilename'
2094 a = a.decode(encoding)
2095 quoted_args.append(compat_shlex_quote(a))
2096 return ' '.join(quoted_args)
2097
2098
2099 def smuggle_url(url, data):
2100 """ Pass additional data in a URL for internal use. """
2101
2102 url, idata = unsmuggle_url(url, {})
2103 data.update(idata)
2104 sdata = compat_urllib_parse_urlencode(
2105 {'__youtubedl_smuggle': json.dumps(data)})
2106 return url + '#' + sdata
2107
2108
2109 def unsmuggle_url(smug_url, default=None):
2110 if '#__youtubedl_smuggle' not in smug_url:
2111 return smug_url, default
2112 url, _, sdata = smug_url.rpartition('#')
2113 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2114 data = json.loads(jsond)
2115 return url, data
2116
2117
2118 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2119 """ Formats numbers with decimal sufixes like K, M, etc """
2120 num, factor = float_or_none(num), float(factor)
2121 if num is None or num < 0:
2122 return None
2123 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2124 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2125 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2126 if factor == 1024:
2127 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2128 converted = num / (factor ** exponent)
2129 return fmt % (converted, suffix)
2130
2131
2132 def format_bytes(bytes):
2133 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2134
2135
2136 def lookup_unit_table(unit_table, s):
2137 units_re = '|'.join(re.escape(u) for u in unit_table)
2138 m = re.match(
2139 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2140 if not m:
2141 return None
2142 num_str = m.group('num').replace(',', '.')
2143 mult = unit_table[m.group('unit')]
2144 return int(float(num_str) * mult)
2145
2146
2147 def parse_filesize(s):
2148 if s is None:
2149 return None
2150
2151 # The lower-case forms are of course incorrect and unofficial,
2152 # but we support those too
2153 _UNIT_TABLE = {
2154 'B': 1,
2155 'b': 1,
2156 'bytes': 1,
2157 'KiB': 1024,
2158 'KB': 1000,
2159 'kB': 1024,
2160 'Kb': 1000,
2161 'kb': 1000,
2162 'kilobytes': 1000,
2163 'kibibytes': 1024,
2164 'MiB': 1024 ** 2,
2165 'MB': 1000 ** 2,
2166 'mB': 1024 ** 2,
2167 'Mb': 1000 ** 2,
2168 'mb': 1000 ** 2,
2169 'megabytes': 1000 ** 2,
2170 'mebibytes': 1024 ** 2,
2171 'GiB': 1024 ** 3,
2172 'GB': 1000 ** 3,
2173 'gB': 1024 ** 3,
2174 'Gb': 1000 ** 3,
2175 'gb': 1000 ** 3,
2176 'gigabytes': 1000 ** 3,
2177 'gibibytes': 1024 ** 3,
2178 'TiB': 1024 ** 4,
2179 'TB': 1000 ** 4,
2180 'tB': 1024 ** 4,
2181 'Tb': 1000 ** 4,
2182 'tb': 1000 ** 4,
2183 'terabytes': 1000 ** 4,
2184 'tebibytes': 1024 ** 4,
2185 'PiB': 1024 ** 5,
2186 'PB': 1000 ** 5,
2187 'pB': 1024 ** 5,
2188 'Pb': 1000 ** 5,
2189 'pb': 1000 ** 5,
2190 'petabytes': 1000 ** 5,
2191 'pebibytes': 1024 ** 5,
2192 'EiB': 1024 ** 6,
2193 'EB': 1000 ** 6,
2194 'eB': 1024 ** 6,
2195 'Eb': 1000 ** 6,
2196 'eb': 1000 ** 6,
2197 'exabytes': 1000 ** 6,
2198 'exbibytes': 1024 ** 6,
2199 'ZiB': 1024 ** 7,
2200 'ZB': 1000 ** 7,
2201 'zB': 1024 ** 7,
2202 'Zb': 1000 ** 7,
2203 'zb': 1000 ** 7,
2204 'zettabytes': 1000 ** 7,
2205 'zebibytes': 1024 ** 7,
2206 'YiB': 1024 ** 8,
2207 'YB': 1000 ** 8,
2208 'yB': 1024 ** 8,
2209 'Yb': 1000 ** 8,
2210 'yb': 1000 ** 8,
2211 'yottabytes': 1000 ** 8,
2212 'yobibytes': 1024 ** 8,
2213 }
2214
2215 return lookup_unit_table(_UNIT_TABLE, s)
2216
2217
2218 def parse_count(s):
2219 if s is None:
2220 return None
2221
2222 s = re.sub(r'^[^\d]+\s', '', s).strip()
2223
2224 if re.match(r'^[\d,.]+$', s):
2225 return str_to_int(s)
2226
2227 _UNIT_TABLE = {
2228 'k': 1000,
2229 'K': 1000,
2230 'm': 1000 ** 2,
2231 'M': 1000 ** 2,
2232 'kk': 1000 ** 2,
2233 'KK': 1000 ** 2,
2234 'b': 1000 ** 3,
2235 'B': 1000 ** 3,
2236 }
2237
2238 ret = lookup_unit_table(_UNIT_TABLE, s)
2239 if ret is not None:
2240 return ret
2241
2242 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2243 if mobj:
2244 return str_to_int(mobj.group(1))
2245
2246
2247 def parse_resolution(s, *, lenient=False):
2248 if s is None:
2249 return {}
2250
2251 if lenient:
2252 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2253 else:
2254 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2255 if mobj:
2256 return {
2257 'width': int(mobj.group('w')),
2258 'height': int(mobj.group('h')),
2259 }
2260
2261 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2262 if mobj:
2263 return {'height': int(mobj.group(1))}
2264
2265 mobj = re.search(r'\b([48])[kK]\b', s)
2266 if mobj:
2267 return {'height': int(mobj.group(1)) * 540}
2268
2269 return {}
2270
2271
2272 def parse_bitrate(s):
2273 if not isinstance(s, compat_str):
2274 return
2275 mobj = re.search(r'\b(\d+)\s*kbps', s)
2276 if mobj:
2277 return int(mobj.group(1))
2278
2279
2280 def month_by_name(name, lang='en'):
2281 """ Return the number of a month by (locale-independently) English name """
2282
2283 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2284
2285 try:
2286 return month_names.index(name) + 1
2287 except ValueError:
2288 return None
2289
2290
2291 def month_by_abbreviation(abbrev):
2292 """ Return the number of a month by (locale-independently) English
2293 abbreviations """
2294
2295 try:
2296 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2297 except ValueError:
2298 return None
2299
2300
2301 def fix_xml_ampersands(xml_str):
2302 """Replace all the '&' by '&amp;' in XML"""
2303 return re.sub(
2304 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2305 '&amp;',
2306 xml_str)
2307
2308
2309 def setproctitle(title):
2310 assert isinstance(title, compat_str)
2311
2312 # ctypes in Jython is not complete
2313 # http://bugs.jython.org/issue2148
2314 if sys.platform.startswith('java'):
2315 return
2316
2317 try:
2318 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2319 except OSError:
2320 return
2321 except TypeError:
2322 # LoadLibrary in Windows Python 2.7.13 only expects
2323 # a bytestring, but since unicode_literals turns
2324 # every string into a unicode string, it fails.
2325 return
2326 title_bytes = title.encode()
2327 buf = ctypes.create_string_buffer(len(title_bytes))
2328 buf.value = title_bytes
2329 try:
2330 libc.prctl(15, buf, 0, 0, 0)
2331 except AttributeError:
2332 return # Strange libc, just skip this
2333
2334
2335 def remove_start(s, start):
2336 return s[len(start):] if s is not None and s.startswith(start) else s
2337
2338
2339 def remove_end(s, end):
2340 return s[:-len(end)] if s is not None and s.endswith(end) else s
2341
2342
2343 def remove_quotes(s):
2344 if s is None or len(s) < 2:
2345 return s
2346 for quote in ('"', "'", ):
2347 if s[0] == quote and s[-1] == quote:
2348 return s[1:-1]
2349 return s
2350
2351
2352 def get_domain(url):
2353 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2354 return domain.group('domain') if domain else None
2355
2356
2357 def url_basename(url):
2358 path = compat_urlparse.urlparse(url).path
2359 return path.strip('/').split('/')[-1]
2360
2361
2362 def base_url(url):
2363 return re.match(r'https?://[^?#&]+/', url).group()
2364
2365
2366 def urljoin(base, path):
2367 if isinstance(path, bytes):
2368 path = path.decode()
2369 if not isinstance(path, compat_str) or not path:
2370 return None
2371 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2372 return path
2373 if isinstance(base, bytes):
2374 base = base.decode()
2375 if not isinstance(base, compat_str) or not re.match(
2376 r'^(?:https?:)?//', base):
2377 return None
2378 return compat_urlparse.urljoin(base, path)
2379
2380
2381 class HEADRequest(compat_urllib_request.Request):
2382 def get_method(self):
2383 return 'HEAD'
2384
2385
2386 class PUTRequest(compat_urllib_request.Request):
2387 def get_method(self):
2388 return 'PUT'
2389
2390
2391 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2392 if get_attr and v is not None:
2393 v = getattr(v, get_attr, None)
2394 try:
2395 return int(v) * invscale // scale
2396 except (ValueError, TypeError, OverflowError):
2397 return default
2398
2399
2400 def str_or_none(v, default=None):
2401 return default if v is None else compat_str(v)
2402
2403
2404 def str_to_int(int_str):
2405 """ A more relaxed version of int_or_none """
2406 if isinstance(int_str, int):
2407 return int_str
2408 elif isinstance(int_str, compat_str):
2409 int_str = re.sub(r'[,\.\+]', '', int_str)
2410 return int_or_none(int_str)
2411
2412
2413 def float_or_none(v, scale=1, invscale=1, default=None):
2414 if v is None:
2415 return default
2416 try:
2417 return float(v) * invscale / scale
2418 except (ValueError, TypeError):
2419 return default
2420
2421
2422 def bool_or_none(v, default=None):
2423 return v if isinstance(v, bool) else default
2424
2425
2426 def strip_or_none(v, default=None):
2427 return v.strip() if isinstance(v, compat_str) else default
2428
2429
2430 def url_or_none(url):
2431 if not url or not isinstance(url, compat_str):
2432 return None
2433 url = url.strip()
2434 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2435
2436
2437 def request_to_url(req):
2438 if isinstance(req, compat_urllib_request.Request):
2439 return req.get_full_url()
2440 else:
2441 return req
2442
2443
2444 def strftime_or_none(timestamp, date_format, default=None):
2445 datetime_object = None
2446 try:
2447 if isinstance(timestamp, (int, float)): # unix timestamp
2448 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2449 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2450 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2451 return datetime_object.strftime(date_format)
2452 except (ValueError, TypeError, AttributeError):
2453 return default
2454
2455
2456 def parse_duration(s):
2457 if not isinstance(s, str):
2458 return None
2459 s = s.strip()
2460 if not s:
2461 return None
2462
2463 days, hours, mins, secs, ms = [None] * 5
2464 m = re.match(r'''(?x)
2465 (?P<before_secs>
2466 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2467 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2468 (?P<ms>[.:][0-9]+)?Z?$
2469 ''', s)
2470 if m:
2471 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2472 else:
2473 m = re.match(
2474 r'''(?ix)(?:P?
2475 (?:
2476 [0-9]+\s*y(?:ears?)?,?\s*
2477 )?
2478 (?:
2479 [0-9]+\s*m(?:onths?)?,?\s*
2480 )?
2481 (?:
2482 [0-9]+\s*w(?:eeks?)?,?\s*
2483 )?
2484 (?:
2485 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2486 )?
2487 T)?
2488 (?:
2489 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2490 )?
2491 (?:
2492 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2493 )?
2494 (?:
2495 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2496 )?Z?$''', s)
2497 if m:
2498 days, hours, mins, secs, ms = m.groups()
2499 else:
2500 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2501 if m:
2502 hours, mins = m.groups()
2503 else:
2504 return None
2505
2506 if ms:
2507 ms = ms.replace(':', '.')
2508 return sum(float(part or 0) * mult for part, mult in (
2509 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2510
2511
2512 def prepend_extension(filename, ext, expected_real_ext=None):
2513 name, real_ext = os.path.splitext(filename)
2514 return (
2515 f'{name}.{ext}{real_ext}'
2516 if not expected_real_ext or real_ext[1:] == expected_real_ext
2517 else f'{filename}.{ext}')
2518
2519
2520 def replace_extension(filename, ext, expected_real_ext=None):
2521 name, real_ext = os.path.splitext(filename)
2522 return '{}.{}'.format(
2523 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2524 ext)
2525
2526
2527 def check_executable(exe, args=[]):
2528 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2529 args can be a list of arguments for a short output (like -version) """
2530 try:
2531 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2532 except OSError:
2533 return False
2534 return exe
2535
2536
2537 def _get_exe_version_output(exe, args, *, to_screen=None):
2538 if to_screen:
2539 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2540 try:
2541 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2542 # SIGTTOU if yt-dlp is run in the background.
2543 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2544 out, _ = Popen(
2545 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2546 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2547 except OSError:
2548 return False
2549 if isinstance(out, bytes): # Python 2.x
2550 out = out.decode('ascii', 'ignore')
2551 return out
2552
2553
2554 def detect_exe_version(output, version_re=None, unrecognized='present'):
2555 assert isinstance(output, compat_str)
2556 if version_re is None:
2557 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2558 m = re.search(version_re, output)
2559 if m:
2560 return m.group(1)
2561 else:
2562 return unrecognized
2563
2564
2565 def get_exe_version(exe, args=['--version'],
2566 version_re=None, unrecognized='present'):
2567 """ Returns the version of the specified executable,
2568 or False if the executable is not present """
2569 out = _get_exe_version_output(exe, args)
2570 return detect_exe_version(out, version_re, unrecognized) if out else False
2571
2572
2573 class LazyList(collections.abc.Sequence):
2574 """Lazy immutable list from an iterable
2575 Note that slices of a LazyList are lists and not LazyList"""
2576
2577 class IndexError(IndexError):
2578 pass
2579
2580 def __init__(self, iterable, *, reverse=False, _cache=None):
2581 self._iterable = iter(iterable)
2582 self._cache = [] if _cache is None else _cache
2583 self._reversed = reverse
2584
2585 def __iter__(self):
2586 if self._reversed:
2587 # We need to consume the entire iterable to iterate in reverse
2588 yield from self.exhaust()
2589 return
2590 yield from self._cache
2591 for item in self._iterable:
2592 self._cache.append(item)
2593 yield item
2594
2595 def _exhaust(self):
2596 self._cache.extend(self._iterable)
2597 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2598 return self._cache
2599
2600 def exhaust(self):
2601 """Evaluate the entire iterable"""
2602 return self._exhaust()[::-1 if self._reversed else 1]
2603
2604 @staticmethod
2605 def _reverse_index(x):
2606 return None if x is None else -(x + 1)
2607
2608 def __getitem__(self, idx):
2609 if isinstance(idx, slice):
2610 if self._reversed:
2611 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2612 start, stop, step = idx.start, idx.stop, idx.step or 1
2613 elif isinstance(idx, int):
2614 if self._reversed:
2615 idx = self._reverse_index(idx)
2616 start, stop, step = idx, idx, 0
2617 else:
2618 raise TypeError('indices must be integers or slices')
2619 if ((start or 0) < 0 or (stop or 0) < 0
2620 or (start is None and step < 0)
2621 or (stop is None and step > 0)):
2622 # We need to consume the entire iterable to be able to slice from the end
2623 # Obviously, never use this with infinite iterables
2624 self._exhaust()
2625 try:
2626 return self._cache[idx]
2627 except IndexError as e:
2628 raise self.IndexError(e) from e
2629 n = max(start or 0, stop or 0) - len(self._cache) + 1
2630 if n > 0:
2631 self._cache.extend(itertools.islice(self._iterable, n))
2632 try:
2633 return self._cache[idx]
2634 except IndexError as e:
2635 raise self.IndexError(e) from e
2636
2637 def __bool__(self):
2638 try:
2639 self[-1] if self._reversed else self[0]
2640 except self.IndexError:
2641 return False
2642 return True
2643
2644 def __len__(self):
2645 self._exhaust()
2646 return len(self._cache)
2647
2648 def __reversed__(self):
2649 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2650
2651 def __copy__(self):
2652 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2653
2654 def __repr__(self):
2655 # repr and str should mimic a list. So we exhaust the iterable
2656 return repr(self.exhaust())
2657
2658 def __str__(self):
2659 return repr(self.exhaust())
2660
2661
2662 class PagedList:
2663
2664 class IndexError(IndexError):
2665 pass
2666
2667 def __len__(self):
2668 # This is only useful for tests
2669 return len(self.getslice())
2670
2671 def __init__(self, pagefunc, pagesize, use_cache=True):
2672 self._pagefunc = pagefunc
2673 self._pagesize = pagesize
2674 self._pagecount = float('inf')
2675 self._use_cache = use_cache
2676 self._cache = {}
2677
2678 def getpage(self, pagenum):
2679 page_results = self._cache.get(pagenum)
2680 if page_results is None:
2681 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2682 if self._use_cache:
2683 self._cache[pagenum] = page_results
2684 return page_results
2685
2686 def getslice(self, start=0, end=None):
2687 return list(self._getslice(start, end))
2688
2689 def _getslice(self, start, end):
2690 raise NotImplementedError('This method must be implemented by subclasses')
2691
2692 def __getitem__(self, idx):
2693 assert self._use_cache, 'Indexing PagedList requires cache'
2694 if not isinstance(idx, int) or idx < 0:
2695 raise TypeError('indices must be non-negative integers')
2696 entries = self.getslice(idx, idx + 1)
2697 if not entries:
2698 raise self.IndexError()
2699 return entries[0]
2700
2701
2702 class OnDemandPagedList(PagedList):
2703 """Download pages until a page with less than maximum results"""
2704
2705 def _getslice(self, start, end):
2706 for pagenum in itertools.count(start // self._pagesize):
2707 firstid = pagenum * self._pagesize
2708 nextfirstid = pagenum * self._pagesize + self._pagesize
2709 if start >= nextfirstid:
2710 continue
2711
2712 startv = (
2713 start % self._pagesize
2714 if firstid <= start < nextfirstid
2715 else 0)
2716 endv = (
2717 ((end - 1) % self._pagesize) + 1
2718 if (end is not None and firstid <= end <= nextfirstid)
2719 else None)
2720
2721 try:
2722 page_results = self.getpage(pagenum)
2723 except Exception:
2724 self._pagecount = pagenum - 1
2725 raise
2726 if startv != 0 or endv is not None:
2727 page_results = page_results[startv:endv]
2728 yield from page_results
2729
2730 # A little optimization - if current page is not "full", ie. does
2731 # not contain page_size videos then we can assume that this page
2732 # is the last one - there are no more ids on further pages -
2733 # i.e. no need to query again.
2734 if len(page_results) + startv < self._pagesize:
2735 break
2736
2737 # If we got the whole page, but the next page is not interesting,
2738 # break out early as well
2739 if end == nextfirstid:
2740 break
2741
2742
2743 class InAdvancePagedList(PagedList):
2744 """PagedList with total number of pages known in advance"""
2745
2746 def __init__(self, pagefunc, pagecount, pagesize):
2747 PagedList.__init__(self, pagefunc, pagesize, True)
2748 self._pagecount = pagecount
2749
2750 def _getslice(self, start, end):
2751 start_page = start // self._pagesize
2752 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2753 skip_elems = start - start_page * self._pagesize
2754 only_more = None if end is None else end - start
2755 for pagenum in range(start_page, end_page):
2756 page_results = self.getpage(pagenum)
2757 if skip_elems:
2758 page_results = page_results[skip_elems:]
2759 skip_elems = None
2760 if only_more is not None:
2761 if len(page_results) < only_more:
2762 only_more -= len(page_results)
2763 else:
2764 yield from page_results[:only_more]
2765 break
2766 yield from page_results
2767
2768
2769 def uppercase_escape(s):
2770 unicode_escape = codecs.getdecoder('unicode_escape')
2771 return re.sub(
2772 r'\\U[0-9a-fA-F]{8}',
2773 lambda m: unicode_escape(m.group(0))[0],
2774 s)
2775
2776
2777 def lowercase_escape(s):
2778 unicode_escape = codecs.getdecoder('unicode_escape')
2779 return re.sub(
2780 r'\\u[0-9a-fA-F]{4}',
2781 lambda m: unicode_escape(m.group(0))[0],
2782 s)
2783
2784
2785 def escape_rfc3986(s):
2786 """Escape non-ASCII characters as suggested by RFC 3986"""
2787 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2788
2789
2790 def escape_url(url):
2791 """Escape URL as suggested by RFC 3986"""
2792 url_parsed = compat_urllib_parse_urlparse(url)
2793 return url_parsed._replace(
2794 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2795 path=escape_rfc3986(url_parsed.path),
2796 params=escape_rfc3986(url_parsed.params),
2797 query=escape_rfc3986(url_parsed.query),
2798 fragment=escape_rfc3986(url_parsed.fragment)
2799 ).geturl()
2800
2801
2802 def parse_qs(url):
2803 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2804
2805
2806 def read_batch_urls(batch_fd):
2807 def fixup(url):
2808 if not isinstance(url, compat_str):
2809 url = url.decode('utf-8', 'replace')
2810 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2811 for bom in BOM_UTF8:
2812 if url.startswith(bom):
2813 url = url[len(bom):]
2814 url = url.lstrip()
2815 if not url or url.startswith(('#', ';', ']')):
2816 return False
2817 # "#" cannot be stripped out since it is part of the URI
2818 # However, it can be safely stipped out if follwing a whitespace
2819 return re.split(r'\s#', url, 1)[0].rstrip()
2820
2821 with contextlib.closing(batch_fd) as fd:
2822 return [url for url in map(fixup, fd) if url]
2823
2824
2825 def urlencode_postdata(*args, **kargs):
2826 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2827
2828
2829 def update_url_query(url, query):
2830 if not query:
2831 return url
2832 parsed_url = compat_urlparse.urlparse(url)
2833 qs = compat_parse_qs(parsed_url.query)
2834 qs.update(query)
2835 return compat_urlparse.urlunparse(parsed_url._replace(
2836 query=compat_urllib_parse_urlencode(qs, True)))
2837
2838
2839 def update_Request(req, url=None, data=None, headers={}, query={}):
2840 req_headers = req.headers.copy()
2841 req_headers.update(headers)
2842 req_data = data or req.data
2843 req_url = update_url_query(url or req.get_full_url(), query)
2844 req_get_method = req.get_method()
2845 if req_get_method == 'HEAD':
2846 req_type = HEADRequest
2847 elif req_get_method == 'PUT':
2848 req_type = PUTRequest
2849 else:
2850 req_type = compat_urllib_request.Request
2851 new_req = req_type(
2852 req_url, data=req_data, headers=req_headers,
2853 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2854 if hasattr(req, 'timeout'):
2855 new_req.timeout = req.timeout
2856 return new_req
2857
2858
2859 def _multipart_encode_impl(data, boundary):
2860 content_type = 'multipart/form-data; boundary=%s' % boundary
2861
2862 out = b''
2863 for k, v in data.items():
2864 out += b'--' + boundary.encode('ascii') + b'\r\n'
2865 if isinstance(k, compat_str):
2866 k = k.encode()
2867 if isinstance(v, compat_str):
2868 v = v.encode()
2869 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2870 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2871 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2872 if boundary.encode('ascii') in content:
2873 raise ValueError('Boundary overlaps with data')
2874 out += content
2875
2876 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2877
2878 return out, content_type
2879
2880
2881 def multipart_encode(data, boundary=None):
2882 '''
2883 Encode a dict to RFC 7578-compliant form-data
2884
2885 data:
2886 A dict where keys and values can be either Unicode or bytes-like
2887 objects.
2888 boundary:
2889 If specified a Unicode object, it's used as the boundary. Otherwise
2890 a random boundary is generated.
2891
2892 Reference: https://tools.ietf.org/html/rfc7578
2893 '''
2894 has_specified_boundary = boundary is not None
2895
2896 while True:
2897 if boundary is None:
2898 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2899
2900 try:
2901 out, content_type = _multipart_encode_impl(data, boundary)
2902 break
2903 except ValueError:
2904 if has_specified_boundary:
2905 raise
2906 boundary = None
2907
2908 return out, content_type
2909
2910
2911 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2912 for val in map(d.get, variadic(key_or_keys)):
2913 if val is not None and (val or not skip_false_values):
2914 return val
2915 return default
2916
2917
2918 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2919 for f in funcs:
2920 try:
2921 val = f(*args, **kwargs)
2922 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
2923 pass
2924 else:
2925 if expected_type is None or isinstance(val, expected_type):
2926 return val
2927
2928
2929 def try_get(src, getter, expected_type=None):
2930 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2931
2932
2933 def filter_dict(dct, cndn=lambda _, v: v is not None):
2934 return {k: v for k, v in dct.items() if cndn(k, v)}
2935
2936
2937 def merge_dicts(*dicts):
2938 merged = {}
2939 for a_dict in dicts:
2940 for k, v in a_dict.items():
2941 if (v is not None and k not in merged
2942 or isinstance(v, str) and merged[k] == ''):
2943 merged[k] = v
2944 return merged
2945
2946
2947 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2948 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2949
2950
2951 US_RATINGS = {
2952 'G': 0,
2953 'PG': 10,
2954 'PG-13': 13,
2955 'R': 16,
2956 'NC': 18,
2957 }
2958
2959
2960 TV_PARENTAL_GUIDELINES = {
2961 'TV-Y': 0,
2962 'TV-Y7': 7,
2963 'TV-G': 0,
2964 'TV-PG': 0,
2965 'TV-14': 14,
2966 'TV-MA': 17,
2967 }
2968
2969
2970 def parse_age_limit(s):
2971 # isinstance(False, int) is True. So type() must be used instead
2972 if type(s) is int: # noqa: E721
2973 return s if 0 <= s <= 21 else None
2974 elif not isinstance(s, str):
2975 return None
2976 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2977 if m:
2978 return int(m.group('age'))
2979 s = s.upper()
2980 if s in US_RATINGS:
2981 return US_RATINGS[s]
2982 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
2983 if m:
2984 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
2985 return None
2986
2987
2988 def strip_jsonp(code):
2989 return re.sub(
2990 r'''(?sx)^
2991 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2992 (?:\s*&&\s*(?P=func_name))?
2993 \s*\(\s*(?P<callback_data>.*)\);?
2994 \s*?(?://[^\n]*)*$''',
2995 r'\g<callback_data>', code)
2996
2997
2998 def js_to_json(code, vars={}):
2999 # vars is a dict of var, val pairs to substitute
3000 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3001 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3002 INTEGER_TABLE = (
3003 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3004 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3005 )
3006
3007 def fix_kv(m):
3008 v = m.group(0)
3009 if v in ('true', 'false', 'null'):
3010 return v
3011 elif v in ('undefined', 'void 0'):
3012 return 'null'
3013 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3014 return ""
3015
3016 if v[0] in ("'", '"'):
3017 v = re.sub(r'(?s)\\.|"', lambda m: {
3018 '"': '\\"',
3019 "\\'": "'",
3020 '\\\n': '',
3021 '\\x': '\\u00',
3022 }.get(m.group(0), m.group(0)), v[1:-1])
3023 else:
3024 for regex, base in INTEGER_TABLE:
3025 im = re.match(regex, v)
3026 if im:
3027 i = int(im.group(1), base)
3028 return '"%d":' % i if v.endswith(':') else '%d' % i
3029
3030 if v in vars:
3031 return vars[v]
3032
3033 return '"%s"' % v
3034
3035 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3036
3037 return re.sub(r'''(?sx)
3038 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3039 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3040 {comment}|,(?={skip}[\]}}])|
3041 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3042 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3043 [0-9]+(?={skip}:)|
3044 !+
3045 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3046
3047
3048 def qualities(quality_ids):
3049 """ Get a numeric quality value out of a list of possible values """
3050 def q(qid):
3051 try:
3052 return quality_ids.index(qid)
3053 except ValueError:
3054 return -1
3055 return q
3056
3057
3058 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
3059
3060
3061 DEFAULT_OUTTMPL = {
3062 'default': '%(title)s [%(id)s].%(ext)s',
3063 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3064 }
3065 OUTTMPL_TYPES = {
3066 'chapter': None,
3067 'subtitle': None,
3068 'thumbnail': None,
3069 'description': 'description',
3070 'annotation': 'annotations.xml',
3071 'infojson': 'info.json',
3072 'link': None,
3073 'pl_video': None,
3074 'pl_thumbnail': None,
3075 'pl_description': 'description',
3076 'pl_infojson': 'info.json',
3077 }
3078
3079 # As of [1] format syntax is:
3080 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3081 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3082 STR_FORMAT_RE_TMPL = r'''(?x)
3083 (?<!%)(?P<prefix>(?:%%)*)
3084 %
3085 (?P<has_key>\((?P<key>{0})\))?
3086 (?P<format>
3087 (?P<conversion>[#0\-+ ]+)?
3088 (?P<min_width>\d+)?
3089 (?P<precision>\.\d+)?
3090 (?P<len_mod>[hlL])? # unused in python
3091 {1} # conversion type
3092 )
3093 '''
3094
3095
3096 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3097
3098
3099 def limit_length(s, length):
3100 """ Add ellipses to overly long strings """
3101 if s is None:
3102 return None
3103 ELLIPSES = '...'
3104 if len(s) > length:
3105 return s[:length - len(ELLIPSES)] + ELLIPSES
3106 return s
3107
3108
3109 def version_tuple(v):
3110 return tuple(int(e) for e in re.split(r'[-.]', v))
3111
3112
3113 def is_outdated_version(version, limit, assume_new=True):
3114 if not version:
3115 return not assume_new
3116 try:
3117 return version_tuple(version) < version_tuple(limit)
3118 except ValueError:
3119 return not assume_new
3120
3121
3122 def ytdl_is_updateable():
3123 """ Returns if yt-dlp can be updated with -U """
3124
3125 from .update import is_non_updateable
3126
3127 return not is_non_updateable()
3128
3129
3130 def args_to_str(args):
3131 # Get a short string representation for a subprocess command
3132 return ' '.join(compat_shlex_quote(a) for a in args)
3133
3134
3135 def error_to_compat_str(err):
3136 return str(err)
3137
3138
3139 def error_to_str(err):
3140 return f'{type(err).__name__}: {err}'
3141
3142
3143 def mimetype2ext(mt):
3144 if mt is None:
3145 return None
3146
3147 mt, _, params = mt.partition(';')
3148 mt = mt.strip()
3149
3150 FULL_MAP = {
3151 'audio/mp4': 'm4a',
3152 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3153 # it's the most popular one
3154 'audio/mpeg': 'mp3',
3155 'audio/x-wav': 'wav',
3156 'audio/wav': 'wav',
3157 'audio/wave': 'wav',
3158 }
3159
3160 ext = FULL_MAP.get(mt)
3161 if ext is not None:
3162 return ext
3163
3164 SUBTYPE_MAP = {
3165 '3gpp': '3gp',
3166 'smptett+xml': 'tt',
3167 'ttaf+xml': 'dfxp',
3168 'ttml+xml': 'ttml',
3169 'x-flv': 'flv',
3170 'x-mp4-fragmented': 'mp4',
3171 'x-ms-sami': 'sami',
3172 'x-ms-wmv': 'wmv',
3173 'mpegurl': 'm3u8',
3174 'x-mpegurl': 'm3u8',
3175 'vnd.apple.mpegurl': 'm3u8',
3176 'dash+xml': 'mpd',
3177 'f4m+xml': 'f4m',
3178 'hds+xml': 'f4m',
3179 'vnd.ms-sstr+xml': 'ism',
3180 'quicktime': 'mov',
3181 'mp2t': 'ts',
3182 'x-wav': 'wav',
3183 'filmstrip+json': 'fs',
3184 'svg+xml': 'svg',
3185 }
3186
3187 _, _, subtype = mt.rpartition('/')
3188 ext = SUBTYPE_MAP.get(subtype.lower())
3189 if ext is not None:
3190 return ext
3191
3192 SUFFIX_MAP = {
3193 'json': 'json',
3194 'xml': 'xml',
3195 'zip': 'zip',
3196 'gzip': 'gz',
3197 }
3198
3199 _, _, suffix = subtype.partition('+')
3200 ext = SUFFIX_MAP.get(suffix)
3201 if ext is not None:
3202 return ext
3203
3204 return subtype.replace('+', '.')
3205
3206
3207 def ext2mimetype(ext_or_url):
3208 if not ext_or_url:
3209 return None
3210 if '.' not in ext_or_url:
3211 ext_or_url = f'file.{ext_or_url}'
3212 return mimetypes.guess_type(ext_or_url)[0]
3213
3214
3215 def parse_codecs(codecs_str):
3216 # http://tools.ietf.org/html/rfc6381
3217 if not codecs_str:
3218 return {}
3219 split_codecs = list(filter(None, map(
3220 str.strip, codecs_str.strip().strip(',').split(','))))
3221 vcodec, acodec, scodec, hdr = None, None, None, None
3222 for full_codec in split_codecs:
3223 parts = full_codec.split('.')
3224 codec = parts[0].replace('0', '')
3225 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3226 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3227 if not vcodec:
3228 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3229 if codec in ('dvh1', 'dvhe'):
3230 hdr = 'DV'
3231 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3232 hdr = 'HDR10'
3233 elif full_codec.replace('0', '').startswith('vp9.2'):
3234 hdr = 'HDR10'
3235 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3236 if not acodec:
3237 acodec = full_codec
3238 elif codec in ('stpp', 'wvtt',):
3239 if not scodec:
3240 scodec = full_codec
3241 else:
3242 write_string(f'WARNING: Unknown codec {full_codec}\n')
3243 if vcodec or acodec or scodec:
3244 return {
3245 'vcodec': vcodec or 'none',
3246 'acodec': acodec or 'none',
3247 'dynamic_range': hdr,
3248 **({'scodec': scodec} if scodec is not None else {}),
3249 }
3250 elif len(split_codecs) == 2:
3251 return {
3252 'vcodec': split_codecs[0],
3253 'acodec': split_codecs[1],
3254 }
3255 return {}
3256
3257
3258 def urlhandle_detect_ext(url_handle):
3259 getheader = url_handle.headers.get
3260
3261 cd = getheader('Content-Disposition')
3262 if cd:
3263 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3264 if m:
3265 e = determine_ext(m.group('filename'), default_ext=None)
3266 if e:
3267 return e
3268
3269 return mimetype2ext(getheader('Content-Type'))
3270
3271
3272 def encode_data_uri(data, mime_type):
3273 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3274
3275
3276 def age_restricted(content_limit, age_limit):
3277 """ Returns True iff the content should be blocked """
3278
3279 if age_limit is None: # No limit set
3280 return False
3281 if content_limit is None:
3282 return False # Content available for everyone
3283 return age_limit < content_limit
3284
3285
3286 def is_html(first_bytes):
3287 """ Detect whether a file contains HTML by examining its first bytes. """
3288
3289 BOMS = [
3290 (b'\xef\xbb\xbf', 'utf-8'),
3291 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3292 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3293 (b'\xff\xfe', 'utf-16-le'),
3294 (b'\xfe\xff', 'utf-16-be'),
3295 ]
3296
3297 encoding = 'utf-8'
3298 for bom, enc in BOMS:
3299 while first_bytes.startswith(bom):
3300 encoding, first_bytes = enc, first_bytes[len(bom):]
3301
3302 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3303
3304
3305 def determine_protocol(info_dict):
3306 protocol = info_dict.get('protocol')
3307 if protocol is not None:
3308 return protocol
3309
3310 url = sanitize_url(info_dict['url'])
3311 if url.startswith('rtmp'):
3312 return 'rtmp'
3313 elif url.startswith('mms'):
3314 return 'mms'
3315 elif url.startswith('rtsp'):
3316 return 'rtsp'
3317
3318 ext = determine_ext(url)
3319 if ext == 'm3u8':
3320 return 'm3u8'
3321 elif ext == 'f4m':
3322 return 'f4m'
3323
3324 return compat_urllib_parse_urlparse(url).scheme
3325
3326
3327 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3328 """ Render a list of rows, each as a list of values.
3329 Text after a \t will be right aligned """
3330 def width(string):
3331 return len(remove_terminal_sequences(string).replace('\t', ''))
3332
3333 def get_max_lens(table):
3334 return [max(width(str(v)) for v in col) for col in zip(*table)]
3335
3336 def filter_using_list(row, filterArray):
3337 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3338
3339 max_lens = get_max_lens(data) if hide_empty else []
3340 header_row = filter_using_list(header_row, max_lens)
3341 data = [filter_using_list(row, max_lens) for row in data]
3342
3343 table = [header_row] + data
3344 max_lens = get_max_lens(table)
3345 extra_gap += 1
3346 if delim:
3347 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3348 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3349 for row in table:
3350 for pos, text in enumerate(map(str, row)):
3351 if '\t' in text:
3352 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3353 else:
3354 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3355 ret = '\n'.join(''.join(row).rstrip() for row in table)
3356 return ret
3357
3358
3359 def _match_one(filter_part, dct, incomplete):
3360 # TODO: Generalize code with YoutubeDL._build_format_filter
3361 STRING_OPERATORS = {
3362 '*=': operator.contains,
3363 '^=': lambda attr, value: attr.startswith(value),
3364 '$=': lambda attr, value: attr.endswith(value),
3365 '~=': lambda attr, value: re.search(value, attr),
3366 }
3367 COMPARISON_OPERATORS = {
3368 **STRING_OPERATORS,
3369 '<=': operator.le, # "<=" must be defined above "<"
3370 '<': operator.lt,
3371 '>=': operator.ge,
3372 '>': operator.gt,
3373 '=': operator.eq,
3374 }
3375
3376 if isinstance(incomplete, bool):
3377 is_incomplete = lambda _: incomplete
3378 else:
3379 is_incomplete = lambda k: k in incomplete
3380
3381 operator_rex = re.compile(r'''(?x)\s*
3382 (?P<key>[a-z_]+)
3383 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3384 (?:
3385 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3386 (?P<strval>.+?)
3387 )
3388 \s*$
3389 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3390 m = operator_rex.search(filter_part)
3391 if m:
3392 m = m.groupdict()
3393 unnegated_op = COMPARISON_OPERATORS[m['op']]
3394 if m['negation']:
3395 op = lambda attr, value: not unnegated_op(attr, value)
3396 else:
3397 op = unnegated_op
3398 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3399 if m['quote']:
3400 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3401 actual_value = dct.get(m['key'])
3402 numeric_comparison = None
3403 if isinstance(actual_value, (int, float)):
3404 # If the original field is a string and matching comparisonvalue is
3405 # a number we should respect the origin of the original field
3406 # and process comparison value as a string (see
3407 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3408 try:
3409 numeric_comparison = int(comparison_value)
3410 except ValueError:
3411 numeric_comparison = parse_filesize(comparison_value)
3412 if numeric_comparison is None:
3413 numeric_comparison = parse_filesize(f'{comparison_value}B')
3414 if numeric_comparison is None:
3415 numeric_comparison = parse_duration(comparison_value)
3416 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3417 raise ValueError('Operator %s only supports string values!' % m['op'])
3418 if actual_value is None:
3419 return is_incomplete(m['key']) or m['none_inclusive']
3420 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3421
3422 UNARY_OPERATORS = {
3423 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3424 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3425 }
3426 operator_rex = re.compile(r'''(?x)\s*
3427 (?P<op>%s)\s*(?P<key>[a-z_]+)
3428 \s*$
3429 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3430 m = operator_rex.search(filter_part)
3431 if m:
3432 op = UNARY_OPERATORS[m.group('op')]
3433 actual_value = dct.get(m.group('key'))
3434 if is_incomplete(m.group('key')) and actual_value is None:
3435 return True
3436 return op(actual_value)
3437
3438 raise ValueError('Invalid filter part %r' % filter_part)
3439
3440
3441 def match_str(filter_str, dct, incomplete=False):
3442 """ Filter a dictionary with a simple string syntax.
3443 @returns Whether the filter passes
3444 @param incomplete Set of keys that is expected to be missing from dct.
3445 Can be True/False to indicate all/none of the keys may be missing.
3446 All conditions on incomplete keys pass if the key is missing
3447 """
3448 return all(
3449 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3450 for filter_part in re.split(r'(?<!\\)&', filter_str))
3451
3452
3453 def match_filter_func(filters):
3454 if not filters:
3455 return None
3456 filters = set(variadic(filters))
3457
3458 interactive = '-' in filters
3459 if interactive:
3460 filters.remove('-')
3461
3462 def _match_func(info_dict, incomplete=False):
3463 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3464 return NO_DEFAULT if interactive and not incomplete else None
3465 else:
3466 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3467 filter_str = ') | ('.join(map(str.strip, filters))
3468 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3469 return _match_func
3470
3471
3472 def parse_dfxp_time_expr(time_expr):
3473 if not time_expr:
3474 return
3475
3476 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3477 if mobj:
3478 return float(mobj.group('time_offset'))
3479
3480 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3481 if mobj:
3482 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3483
3484
3485 def srt_subtitles_timecode(seconds):
3486 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3487
3488
3489 def ass_subtitles_timecode(seconds):
3490 time = timetuple_from_msec(seconds * 1000)
3491 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3492
3493
3494 def dfxp2srt(dfxp_data):
3495 '''
3496 @param dfxp_data A bytes-like object containing DFXP data
3497 @returns A unicode object containing converted SRT data
3498 '''
3499 LEGACY_NAMESPACES = (
3500 (b'http://www.w3.org/ns/ttml', [
3501 b'http://www.w3.org/2004/11/ttaf1',
3502 b'http://www.w3.org/2006/04/ttaf1',
3503 b'http://www.w3.org/2006/10/ttaf1',
3504 ]),
3505 (b'http://www.w3.org/ns/ttml#styling', [
3506 b'http://www.w3.org/ns/ttml#style',
3507 ]),
3508 )
3509
3510 SUPPORTED_STYLING = [
3511 'color',
3512 'fontFamily',
3513 'fontSize',
3514 'fontStyle',
3515 'fontWeight',
3516 'textDecoration'
3517 ]
3518
3519 _x = functools.partial(xpath_with_ns, ns_map={
3520 'xml': 'http://www.w3.org/XML/1998/namespace',
3521 'ttml': 'http://www.w3.org/ns/ttml',
3522 'tts': 'http://www.w3.org/ns/ttml#styling',
3523 })
3524
3525 styles = {}
3526 default_style = {}
3527
3528 class TTMLPElementParser:
3529 _out = ''
3530 _unclosed_elements = []
3531 _applied_styles = []
3532
3533 def start(self, tag, attrib):
3534 if tag in (_x('ttml:br'), 'br'):
3535 self._out += '\n'
3536 else:
3537 unclosed_elements = []
3538 style = {}
3539 element_style_id = attrib.get('style')
3540 if default_style:
3541 style.update(default_style)
3542 if element_style_id:
3543 style.update(styles.get(element_style_id, {}))
3544 for prop in SUPPORTED_STYLING:
3545 prop_val = attrib.get(_x('tts:' + prop))
3546 if prop_val:
3547 style[prop] = prop_val
3548 if style:
3549 font = ''
3550 for k, v in sorted(style.items()):
3551 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3552 continue
3553 if k == 'color':
3554 font += ' color="%s"' % v
3555 elif k == 'fontSize':
3556 font += ' size="%s"' % v
3557 elif k == 'fontFamily':
3558 font += ' face="%s"' % v
3559 elif k == 'fontWeight' and v == 'bold':
3560 self._out += '<b>'
3561 unclosed_elements.append('b')
3562 elif k == 'fontStyle' and v == 'italic':
3563 self._out += '<i>'
3564 unclosed_elements.append('i')
3565 elif k == 'textDecoration' and v == 'underline':
3566 self._out += '<u>'
3567 unclosed_elements.append('u')
3568 if font:
3569 self._out += '<font' + font + '>'
3570 unclosed_elements.append('font')
3571 applied_style = {}
3572 if self._applied_styles:
3573 applied_style.update(self._applied_styles[-1])
3574 applied_style.update(style)
3575 self._applied_styles.append(applied_style)
3576 self._unclosed_elements.append(unclosed_elements)
3577
3578 def end(self, tag):
3579 if tag not in (_x('ttml:br'), 'br'):
3580 unclosed_elements = self._unclosed_elements.pop()
3581 for element in reversed(unclosed_elements):
3582 self._out += '</%s>' % element
3583 if unclosed_elements and self._applied_styles:
3584 self._applied_styles.pop()
3585
3586 def data(self, data):
3587 self._out += data
3588
3589 def close(self):
3590 return self._out.strip()
3591
3592 def parse_node(node):
3593 target = TTMLPElementParser()
3594 parser = xml.etree.ElementTree.XMLParser(target=target)
3595 parser.feed(xml.etree.ElementTree.tostring(node))
3596 return parser.close()
3597
3598 for k, v in LEGACY_NAMESPACES:
3599 for ns in v:
3600 dfxp_data = dfxp_data.replace(ns, k)
3601
3602 dfxp = compat_etree_fromstring(dfxp_data)
3603 out = []
3604 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3605
3606 if not paras:
3607 raise ValueError('Invalid dfxp/TTML subtitle')
3608
3609 repeat = False
3610 while True:
3611 for style in dfxp.findall(_x('.//ttml:style')):
3612 style_id = style.get('id') or style.get(_x('xml:id'))
3613 if not style_id:
3614 continue
3615 parent_style_id = style.get('style')
3616 if parent_style_id:
3617 if parent_style_id not in styles:
3618 repeat = True
3619 continue
3620 styles[style_id] = styles[parent_style_id].copy()
3621 for prop in SUPPORTED_STYLING:
3622 prop_val = style.get(_x('tts:' + prop))
3623 if prop_val:
3624 styles.setdefault(style_id, {})[prop] = prop_val
3625 if repeat:
3626 repeat = False
3627 else:
3628 break
3629
3630 for p in ('body', 'div'):
3631 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3632 if ele is None:
3633 continue
3634 style = styles.get(ele.get('style'))
3635 if not style:
3636 continue
3637 default_style.update(style)
3638
3639 for para, index in zip(paras, itertools.count(1)):
3640 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3641 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3642 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3643 if begin_time is None:
3644 continue
3645 if not end_time:
3646 if not dur:
3647 continue
3648 end_time = begin_time + dur
3649 out.append('%d\n%s --> %s\n%s\n\n' % (
3650 index,
3651 srt_subtitles_timecode(begin_time),
3652 srt_subtitles_timecode(end_time),
3653 parse_node(para)))
3654
3655 return ''.join(out)
3656
3657
3658 def cli_option(params, command_option, param, separator=None):
3659 param = params.get(param)
3660 return ([] if param is None
3661 else [command_option, str(param)] if separator is None
3662 else [f'{command_option}{separator}{param}'])
3663
3664
3665 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3666 param = params.get(param)
3667 assert param in (True, False, None)
3668 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3669
3670
3671 def cli_valueless_option(params, command_option, param, expected_value=True):
3672 return [command_option] if params.get(param) == expected_value else []
3673
3674
3675 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3676 if isinstance(argdict, (list, tuple)): # for backward compatibility
3677 if use_compat:
3678 return argdict
3679 else:
3680 argdict = None
3681 if argdict is None:
3682 return default
3683 assert isinstance(argdict, dict)
3684
3685 assert isinstance(keys, (list, tuple))
3686 for key_list in keys:
3687 arg_list = list(filter(
3688 lambda x: x is not None,
3689 [argdict.get(key.lower()) for key in variadic(key_list)]))
3690 if arg_list:
3691 return [arg for args in arg_list for arg in args]
3692 return default
3693
3694
3695 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3696 main_key, exe = main_key.lower(), exe.lower()
3697 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3698 keys = [f'{root_key}{k}' for k in (keys or [''])]
3699 if root_key in keys:
3700 if main_key != exe:
3701 keys.append((main_key, exe))
3702 keys.append('default')
3703 else:
3704 use_compat = False
3705 return cli_configuration_args(argdict, keys, default, use_compat)
3706
3707
3708 class ISO639Utils:
3709 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3710 _lang_map = {
3711 'aa': 'aar',
3712 'ab': 'abk',
3713 'ae': 'ave',
3714 'af': 'afr',
3715 'ak': 'aka',
3716 'am': 'amh',
3717 'an': 'arg',
3718 'ar': 'ara',
3719 'as': 'asm',
3720 'av': 'ava',
3721 'ay': 'aym',
3722 'az': 'aze',
3723 'ba': 'bak',
3724 'be': 'bel',
3725 'bg': 'bul',
3726 'bh': 'bih',
3727 'bi': 'bis',
3728 'bm': 'bam',
3729 'bn': 'ben',
3730 'bo': 'bod',
3731 'br': 'bre',
3732 'bs': 'bos',
3733 'ca': 'cat',
3734 'ce': 'che',
3735 'ch': 'cha',
3736 'co': 'cos',
3737 'cr': 'cre',
3738 'cs': 'ces',
3739 'cu': 'chu',
3740 'cv': 'chv',
3741 'cy': 'cym',
3742 'da': 'dan',
3743 'de': 'deu',
3744 'dv': 'div',
3745 'dz': 'dzo',
3746 'ee': 'ewe',
3747 'el': 'ell',
3748 'en': 'eng',
3749 'eo': 'epo',
3750 'es': 'spa',
3751 'et': 'est',
3752 'eu': 'eus',
3753 'fa': 'fas',
3754 'ff': 'ful',
3755 'fi': 'fin',
3756 'fj': 'fij',
3757 'fo': 'fao',
3758 'fr': 'fra',
3759 'fy': 'fry',
3760 'ga': 'gle',
3761 'gd': 'gla',
3762 'gl': 'glg',
3763 'gn': 'grn',
3764 'gu': 'guj',
3765 'gv': 'glv',
3766 'ha': 'hau',
3767 'he': 'heb',
3768 'iw': 'heb', # Replaced by he in 1989 revision
3769 'hi': 'hin',
3770 'ho': 'hmo',
3771 'hr': 'hrv',
3772 'ht': 'hat',
3773 'hu': 'hun',
3774 'hy': 'hye',
3775 'hz': 'her',
3776 'ia': 'ina',
3777 'id': 'ind',
3778 'in': 'ind', # Replaced by id in 1989 revision
3779 'ie': 'ile',
3780 'ig': 'ibo',
3781 'ii': 'iii',
3782 'ik': 'ipk',
3783 'io': 'ido',
3784 'is': 'isl',
3785 'it': 'ita',
3786 'iu': 'iku',
3787 'ja': 'jpn',
3788 'jv': 'jav',
3789 'ka': 'kat',
3790 'kg': 'kon',
3791 'ki': 'kik',
3792 'kj': 'kua',
3793 'kk': 'kaz',
3794 'kl': 'kal',
3795 'km': 'khm',
3796 'kn': 'kan',
3797 'ko': 'kor',
3798 'kr': 'kau',
3799 'ks': 'kas',
3800 'ku': 'kur',
3801 'kv': 'kom',
3802 'kw': 'cor',
3803 'ky': 'kir',
3804 'la': 'lat',
3805 'lb': 'ltz',
3806 'lg': 'lug',
3807 'li': 'lim',
3808 'ln': 'lin',
3809 'lo': 'lao',
3810 'lt': 'lit',
3811 'lu': 'lub',
3812 'lv': 'lav',
3813 'mg': 'mlg',
3814 'mh': 'mah',
3815 'mi': 'mri',
3816 'mk': 'mkd',
3817 'ml': 'mal',
3818 'mn': 'mon',
3819 'mr': 'mar',
3820 'ms': 'msa',
3821 'mt': 'mlt',
3822 'my': 'mya',
3823 'na': 'nau',
3824 'nb': 'nob',
3825 'nd': 'nde',
3826 'ne': 'nep',
3827 'ng': 'ndo',
3828 'nl': 'nld',
3829 'nn': 'nno',
3830 'no': 'nor',
3831 'nr': 'nbl',
3832 'nv': 'nav',
3833 'ny': 'nya',
3834 'oc': 'oci',
3835 'oj': 'oji',
3836 'om': 'orm',
3837 'or': 'ori',
3838 'os': 'oss',
3839 'pa': 'pan',
3840 'pi': 'pli',
3841 'pl': 'pol',
3842 'ps': 'pus',
3843 'pt': 'por',
3844 'qu': 'que',
3845 'rm': 'roh',
3846 'rn': 'run',
3847 'ro': 'ron',
3848 'ru': 'rus',
3849 'rw': 'kin',
3850 'sa': 'san',
3851 'sc': 'srd',
3852 'sd': 'snd',
3853 'se': 'sme',
3854 'sg': 'sag',
3855 'si': 'sin',
3856 'sk': 'slk',
3857 'sl': 'slv',
3858 'sm': 'smo',
3859 'sn': 'sna',
3860 'so': 'som',
3861 'sq': 'sqi',
3862 'sr': 'srp',
3863 'ss': 'ssw',
3864 'st': 'sot',
3865 'su': 'sun',
3866 'sv': 'swe',
3867 'sw': 'swa',
3868 'ta': 'tam',
3869 'te': 'tel',
3870 'tg': 'tgk',
3871 'th': 'tha',
3872 'ti': 'tir',
3873 'tk': 'tuk',
3874 'tl': 'tgl',
3875 'tn': 'tsn',
3876 'to': 'ton',
3877 'tr': 'tur',
3878 'ts': 'tso',
3879 'tt': 'tat',
3880 'tw': 'twi',
3881 'ty': 'tah',
3882 'ug': 'uig',
3883 'uk': 'ukr',
3884 'ur': 'urd',
3885 'uz': 'uzb',
3886 've': 'ven',
3887 'vi': 'vie',
3888 'vo': 'vol',
3889 'wa': 'wln',
3890 'wo': 'wol',
3891 'xh': 'xho',
3892 'yi': 'yid',
3893 'ji': 'yid', # Replaced by yi in 1989 revision
3894 'yo': 'yor',
3895 'za': 'zha',
3896 'zh': 'zho',
3897 'zu': 'zul',
3898 }
3899
3900 @classmethod
3901 def short2long(cls, code):
3902 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3903 return cls._lang_map.get(code[:2])
3904
3905 @classmethod
3906 def long2short(cls, code):
3907 """Convert language code from ISO 639-2/T to ISO 639-1"""
3908 for short_name, long_name in cls._lang_map.items():
3909 if long_name == code:
3910 return short_name
3911
3912
3913 class ISO3166Utils:
3914 # From http://data.okfn.org/data/core/country-list
3915 _country_map = {
3916 'AF': 'Afghanistan',
3917 'AX': 'Åland Islands',
3918 'AL': 'Albania',
3919 'DZ': 'Algeria',
3920 'AS': 'American Samoa',
3921 'AD': 'Andorra',
3922 'AO': 'Angola',
3923 'AI': 'Anguilla',
3924 'AQ': 'Antarctica',
3925 'AG': 'Antigua and Barbuda',
3926 'AR': 'Argentina',
3927 'AM': 'Armenia',
3928 'AW': 'Aruba',
3929 'AU': 'Australia',
3930 'AT': 'Austria',
3931 'AZ': 'Azerbaijan',
3932 'BS': 'Bahamas',
3933 'BH': 'Bahrain',
3934 'BD': 'Bangladesh',
3935 'BB': 'Barbados',
3936 'BY': 'Belarus',
3937 'BE': 'Belgium',
3938 'BZ': 'Belize',
3939 'BJ': 'Benin',
3940 'BM': 'Bermuda',
3941 'BT': 'Bhutan',
3942 'BO': 'Bolivia, Plurinational State of',
3943 'BQ': 'Bonaire, Sint Eustatius and Saba',
3944 'BA': 'Bosnia and Herzegovina',
3945 'BW': 'Botswana',
3946 'BV': 'Bouvet Island',
3947 'BR': 'Brazil',
3948 'IO': 'British Indian Ocean Territory',
3949 'BN': 'Brunei Darussalam',
3950 'BG': 'Bulgaria',
3951 'BF': 'Burkina Faso',
3952 'BI': 'Burundi',
3953 'KH': 'Cambodia',
3954 'CM': 'Cameroon',
3955 'CA': 'Canada',
3956 'CV': 'Cape Verde',
3957 'KY': 'Cayman Islands',
3958 'CF': 'Central African Republic',
3959 'TD': 'Chad',
3960 'CL': 'Chile',
3961 'CN': 'China',
3962 'CX': 'Christmas Island',
3963 'CC': 'Cocos (Keeling) Islands',
3964 'CO': 'Colombia',
3965 'KM': 'Comoros',
3966 'CG': 'Congo',
3967 'CD': 'Congo, the Democratic Republic of the',
3968 'CK': 'Cook Islands',
3969 'CR': 'Costa Rica',
3970 'CI': 'Côte d\'Ivoire',
3971 'HR': 'Croatia',
3972 'CU': 'Cuba',
3973 'CW': 'Curaçao',
3974 'CY': 'Cyprus',
3975 'CZ': 'Czech Republic',
3976 'DK': 'Denmark',
3977 'DJ': 'Djibouti',
3978 'DM': 'Dominica',
3979 'DO': 'Dominican Republic',
3980 'EC': 'Ecuador',
3981 'EG': 'Egypt',
3982 'SV': 'El Salvador',
3983 'GQ': 'Equatorial Guinea',
3984 'ER': 'Eritrea',
3985 'EE': 'Estonia',
3986 'ET': 'Ethiopia',
3987 'FK': 'Falkland Islands (Malvinas)',
3988 'FO': 'Faroe Islands',
3989 'FJ': 'Fiji',
3990 'FI': 'Finland',
3991 'FR': 'France',
3992 'GF': 'French Guiana',
3993 'PF': 'French Polynesia',
3994 'TF': 'French Southern Territories',
3995 'GA': 'Gabon',
3996 'GM': 'Gambia',
3997 'GE': 'Georgia',
3998 'DE': 'Germany',
3999 'GH': 'Ghana',
4000 'GI': 'Gibraltar',
4001 'GR': 'Greece',
4002 'GL': 'Greenland',
4003 'GD': 'Grenada',
4004 'GP': 'Guadeloupe',
4005 'GU': 'Guam',
4006 'GT': 'Guatemala',
4007 'GG': 'Guernsey',
4008 'GN': 'Guinea',
4009 'GW': 'Guinea-Bissau',
4010 'GY': 'Guyana',
4011 'HT': 'Haiti',
4012 'HM': 'Heard Island and McDonald Islands',
4013 'VA': 'Holy See (Vatican City State)',
4014 'HN': 'Honduras',
4015 'HK': 'Hong Kong',
4016 'HU': 'Hungary',
4017 'IS': 'Iceland',
4018 'IN': 'India',
4019 'ID': 'Indonesia',
4020 'IR': 'Iran, Islamic Republic of',
4021 'IQ': 'Iraq',
4022 'IE': 'Ireland',
4023 'IM': 'Isle of Man',
4024 'IL': 'Israel',
4025 'IT': 'Italy',
4026 'JM': 'Jamaica',
4027 'JP': 'Japan',
4028 'JE': 'Jersey',
4029 'JO': 'Jordan',
4030 'KZ': 'Kazakhstan',
4031 'KE': 'Kenya',
4032 'KI': 'Kiribati',
4033 'KP': 'Korea, Democratic People\'s Republic of',
4034 'KR': 'Korea, Republic of',
4035 'KW': 'Kuwait',
4036 'KG': 'Kyrgyzstan',
4037 'LA': 'Lao People\'s Democratic Republic',
4038 'LV': 'Latvia',
4039 'LB': 'Lebanon',
4040 'LS': 'Lesotho',
4041 'LR': 'Liberia',
4042 'LY': 'Libya',
4043 'LI': 'Liechtenstein',
4044 'LT': 'Lithuania',
4045 'LU': 'Luxembourg',
4046 'MO': 'Macao',
4047 'MK': 'Macedonia, the Former Yugoslav Republic of',
4048 'MG': 'Madagascar',
4049 'MW': 'Malawi',
4050 'MY': 'Malaysia',
4051 'MV': 'Maldives',
4052 'ML': 'Mali',
4053 'MT': 'Malta',
4054 'MH': 'Marshall Islands',
4055 'MQ': 'Martinique',
4056 'MR': 'Mauritania',
4057 'MU': 'Mauritius',
4058 'YT': 'Mayotte',
4059 'MX': 'Mexico',
4060 'FM': 'Micronesia, Federated States of',
4061 'MD': 'Moldova, Republic of',
4062 'MC': 'Monaco',
4063 'MN': 'Mongolia',
4064 'ME': 'Montenegro',
4065 'MS': 'Montserrat',
4066 'MA': 'Morocco',
4067 'MZ': 'Mozambique',
4068 'MM': 'Myanmar',
4069 'NA': 'Namibia',
4070 'NR': 'Nauru',
4071 'NP': 'Nepal',
4072 'NL': 'Netherlands',
4073 'NC': 'New Caledonia',
4074 'NZ': 'New Zealand',
4075 'NI': 'Nicaragua',
4076 'NE': 'Niger',
4077 'NG': 'Nigeria',
4078 'NU': 'Niue',
4079 'NF': 'Norfolk Island',
4080 'MP': 'Northern Mariana Islands',
4081 'NO': 'Norway',
4082 'OM': 'Oman',
4083 'PK': 'Pakistan',
4084 'PW': 'Palau',
4085 'PS': 'Palestine, State of',
4086 'PA': 'Panama',
4087 'PG': 'Papua New Guinea',
4088 'PY': 'Paraguay',
4089 'PE': 'Peru',
4090 'PH': 'Philippines',
4091 'PN': 'Pitcairn',
4092 'PL': 'Poland',
4093 'PT': 'Portugal',
4094 'PR': 'Puerto Rico',
4095 'QA': 'Qatar',
4096 'RE': 'Réunion',
4097 'RO': 'Romania',
4098 'RU': 'Russian Federation',
4099 'RW': 'Rwanda',
4100 'BL': 'Saint Barthélemy',
4101 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4102 'KN': 'Saint Kitts and Nevis',
4103 'LC': 'Saint Lucia',
4104 'MF': 'Saint Martin (French part)',
4105 'PM': 'Saint Pierre and Miquelon',
4106 'VC': 'Saint Vincent and the Grenadines',
4107 'WS': 'Samoa',
4108 'SM': 'San Marino',
4109 'ST': 'Sao Tome and Principe',
4110 'SA': 'Saudi Arabia',
4111 'SN': 'Senegal',
4112 'RS': 'Serbia',
4113 'SC': 'Seychelles',
4114 'SL': 'Sierra Leone',
4115 'SG': 'Singapore',
4116 'SX': 'Sint Maarten (Dutch part)',
4117 'SK': 'Slovakia',
4118 'SI': 'Slovenia',
4119 'SB': 'Solomon Islands',
4120 'SO': 'Somalia',
4121 'ZA': 'South Africa',
4122 'GS': 'South Georgia and the South Sandwich Islands',
4123 'SS': 'South Sudan',
4124 'ES': 'Spain',
4125 'LK': 'Sri Lanka',
4126 'SD': 'Sudan',
4127 'SR': 'Suriname',
4128 'SJ': 'Svalbard and Jan Mayen',
4129 'SZ': 'Swaziland',
4130 'SE': 'Sweden',
4131 'CH': 'Switzerland',
4132 'SY': 'Syrian Arab Republic',
4133 'TW': 'Taiwan, Province of China',
4134 'TJ': 'Tajikistan',
4135 'TZ': 'Tanzania, United Republic of',
4136 'TH': 'Thailand',
4137 'TL': 'Timor-Leste',
4138 'TG': 'Togo',
4139 'TK': 'Tokelau',
4140 'TO': 'Tonga',
4141 'TT': 'Trinidad and Tobago',
4142 'TN': 'Tunisia',
4143 'TR': 'Turkey',
4144 'TM': 'Turkmenistan',
4145 'TC': 'Turks and Caicos Islands',
4146 'TV': 'Tuvalu',
4147 'UG': 'Uganda',
4148 'UA': 'Ukraine',
4149 'AE': 'United Arab Emirates',
4150 'GB': 'United Kingdom',
4151 'US': 'United States',
4152 'UM': 'United States Minor Outlying Islands',
4153 'UY': 'Uruguay',
4154 'UZ': 'Uzbekistan',
4155 'VU': 'Vanuatu',
4156 'VE': 'Venezuela, Bolivarian Republic of',
4157 'VN': 'Viet Nam',
4158 'VG': 'Virgin Islands, British',
4159 'VI': 'Virgin Islands, U.S.',
4160 'WF': 'Wallis and Futuna',
4161 'EH': 'Western Sahara',
4162 'YE': 'Yemen',
4163 'ZM': 'Zambia',
4164 'ZW': 'Zimbabwe',
4165 # Not ISO 3166 codes, but used for IP blocks
4166 'AP': 'Asia/Pacific Region',
4167 'EU': 'Europe',
4168 }
4169
4170 @classmethod
4171 def short2full(cls, code):
4172 """Convert an ISO 3166-2 country code to the corresponding full name"""
4173 return cls._country_map.get(code.upper())
4174
4175
4176 class GeoUtils:
4177 # Major IPv4 address blocks per country
4178 _country_ip_map = {
4179 'AD': '46.172.224.0/19',
4180 'AE': '94.200.0.0/13',
4181 'AF': '149.54.0.0/17',
4182 'AG': '209.59.64.0/18',
4183 'AI': '204.14.248.0/21',
4184 'AL': '46.99.0.0/16',
4185 'AM': '46.70.0.0/15',
4186 'AO': '105.168.0.0/13',
4187 'AP': '182.50.184.0/21',
4188 'AQ': '23.154.160.0/24',
4189 'AR': '181.0.0.0/12',
4190 'AS': '202.70.112.0/20',
4191 'AT': '77.116.0.0/14',
4192 'AU': '1.128.0.0/11',
4193 'AW': '181.41.0.0/18',
4194 'AX': '185.217.4.0/22',
4195 'AZ': '5.197.0.0/16',
4196 'BA': '31.176.128.0/17',
4197 'BB': '65.48.128.0/17',
4198 'BD': '114.130.0.0/16',
4199 'BE': '57.0.0.0/8',
4200 'BF': '102.178.0.0/15',
4201 'BG': '95.42.0.0/15',
4202 'BH': '37.131.0.0/17',
4203 'BI': '154.117.192.0/18',
4204 'BJ': '137.255.0.0/16',
4205 'BL': '185.212.72.0/23',
4206 'BM': '196.12.64.0/18',
4207 'BN': '156.31.0.0/16',
4208 'BO': '161.56.0.0/16',
4209 'BQ': '161.0.80.0/20',
4210 'BR': '191.128.0.0/12',
4211 'BS': '24.51.64.0/18',
4212 'BT': '119.2.96.0/19',
4213 'BW': '168.167.0.0/16',
4214 'BY': '178.120.0.0/13',
4215 'BZ': '179.42.192.0/18',
4216 'CA': '99.224.0.0/11',
4217 'CD': '41.243.0.0/16',
4218 'CF': '197.242.176.0/21',
4219 'CG': '160.113.0.0/16',
4220 'CH': '85.0.0.0/13',
4221 'CI': '102.136.0.0/14',
4222 'CK': '202.65.32.0/19',
4223 'CL': '152.172.0.0/14',
4224 'CM': '102.244.0.0/14',
4225 'CN': '36.128.0.0/10',
4226 'CO': '181.240.0.0/12',
4227 'CR': '201.192.0.0/12',
4228 'CU': '152.206.0.0/15',
4229 'CV': '165.90.96.0/19',
4230 'CW': '190.88.128.0/17',
4231 'CY': '31.153.0.0/16',
4232 'CZ': '88.100.0.0/14',
4233 'DE': '53.0.0.0/8',
4234 'DJ': '197.241.0.0/17',
4235 'DK': '87.48.0.0/12',
4236 'DM': '192.243.48.0/20',
4237 'DO': '152.166.0.0/15',
4238 'DZ': '41.96.0.0/12',
4239 'EC': '186.68.0.0/15',
4240 'EE': '90.190.0.0/15',
4241 'EG': '156.160.0.0/11',
4242 'ER': '196.200.96.0/20',
4243 'ES': '88.0.0.0/11',
4244 'ET': '196.188.0.0/14',
4245 'EU': '2.16.0.0/13',
4246 'FI': '91.152.0.0/13',
4247 'FJ': '144.120.0.0/16',
4248 'FK': '80.73.208.0/21',
4249 'FM': '119.252.112.0/20',
4250 'FO': '88.85.32.0/19',
4251 'FR': '90.0.0.0/9',
4252 'GA': '41.158.0.0/15',
4253 'GB': '25.0.0.0/8',
4254 'GD': '74.122.88.0/21',
4255 'GE': '31.146.0.0/16',
4256 'GF': '161.22.64.0/18',
4257 'GG': '62.68.160.0/19',
4258 'GH': '154.160.0.0/12',
4259 'GI': '95.164.0.0/16',
4260 'GL': '88.83.0.0/19',
4261 'GM': '160.182.0.0/15',
4262 'GN': '197.149.192.0/18',
4263 'GP': '104.250.0.0/19',
4264 'GQ': '105.235.224.0/20',
4265 'GR': '94.64.0.0/13',
4266 'GT': '168.234.0.0/16',
4267 'GU': '168.123.0.0/16',
4268 'GW': '197.214.80.0/20',
4269 'GY': '181.41.64.0/18',
4270 'HK': '113.252.0.0/14',
4271 'HN': '181.210.0.0/16',
4272 'HR': '93.136.0.0/13',
4273 'HT': '148.102.128.0/17',
4274 'HU': '84.0.0.0/14',
4275 'ID': '39.192.0.0/10',
4276 'IE': '87.32.0.0/12',
4277 'IL': '79.176.0.0/13',
4278 'IM': '5.62.80.0/20',
4279 'IN': '117.192.0.0/10',
4280 'IO': '203.83.48.0/21',
4281 'IQ': '37.236.0.0/14',
4282 'IR': '2.176.0.0/12',
4283 'IS': '82.221.0.0/16',
4284 'IT': '79.0.0.0/10',
4285 'JE': '87.244.64.0/18',
4286 'JM': '72.27.0.0/17',
4287 'JO': '176.29.0.0/16',
4288 'JP': '133.0.0.0/8',
4289 'KE': '105.48.0.0/12',
4290 'KG': '158.181.128.0/17',
4291 'KH': '36.37.128.0/17',
4292 'KI': '103.25.140.0/22',
4293 'KM': '197.255.224.0/20',
4294 'KN': '198.167.192.0/19',
4295 'KP': '175.45.176.0/22',
4296 'KR': '175.192.0.0/10',
4297 'KW': '37.36.0.0/14',
4298 'KY': '64.96.0.0/15',
4299 'KZ': '2.72.0.0/13',
4300 'LA': '115.84.64.0/18',
4301 'LB': '178.135.0.0/16',
4302 'LC': '24.92.144.0/20',
4303 'LI': '82.117.0.0/19',
4304 'LK': '112.134.0.0/15',
4305 'LR': '102.183.0.0/16',
4306 'LS': '129.232.0.0/17',
4307 'LT': '78.56.0.0/13',
4308 'LU': '188.42.0.0/16',
4309 'LV': '46.109.0.0/16',
4310 'LY': '41.252.0.0/14',
4311 'MA': '105.128.0.0/11',
4312 'MC': '88.209.64.0/18',
4313 'MD': '37.246.0.0/16',
4314 'ME': '178.175.0.0/17',
4315 'MF': '74.112.232.0/21',
4316 'MG': '154.126.0.0/17',
4317 'MH': '117.103.88.0/21',
4318 'MK': '77.28.0.0/15',
4319 'ML': '154.118.128.0/18',
4320 'MM': '37.111.0.0/17',
4321 'MN': '49.0.128.0/17',
4322 'MO': '60.246.0.0/16',
4323 'MP': '202.88.64.0/20',
4324 'MQ': '109.203.224.0/19',
4325 'MR': '41.188.64.0/18',
4326 'MS': '208.90.112.0/22',
4327 'MT': '46.11.0.0/16',
4328 'MU': '105.16.0.0/12',
4329 'MV': '27.114.128.0/18',
4330 'MW': '102.70.0.0/15',
4331 'MX': '187.192.0.0/11',
4332 'MY': '175.136.0.0/13',
4333 'MZ': '197.218.0.0/15',
4334 'NA': '41.182.0.0/16',
4335 'NC': '101.101.0.0/18',
4336 'NE': '197.214.0.0/18',
4337 'NF': '203.17.240.0/22',
4338 'NG': '105.112.0.0/12',
4339 'NI': '186.76.0.0/15',
4340 'NL': '145.96.0.0/11',
4341 'NO': '84.208.0.0/13',
4342 'NP': '36.252.0.0/15',
4343 'NR': '203.98.224.0/19',
4344 'NU': '49.156.48.0/22',
4345 'NZ': '49.224.0.0/14',
4346 'OM': '5.36.0.0/15',
4347 'PA': '186.72.0.0/15',
4348 'PE': '186.160.0.0/14',
4349 'PF': '123.50.64.0/18',
4350 'PG': '124.240.192.0/19',
4351 'PH': '49.144.0.0/13',
4352 'PK': '39.32.0.0/11',
4353 'PL': '83.0.0.0/11',
4354 'PM': '70.36.0.0/20',
4355 'PR': '66.50.0.0/16',
4356 'PS': '188.161.0.0/16',
4357 'PT': '85.240.0.0/13',
4358 'PW': '202.124.224.0/20',
4359 'PY': '181.120.0.0/14',
4360 'QA': '37.210.0.0/15',
4361 'RE': '102.35.0.0/16',
4362 'RO': '79.112.0.0/13',
4363 'RS': '93.86.0.0/15',
4364 'RU': '5.136.0.0/13',
4365 'RW': '41.186.0.0/16',
4366 'SA': '188.48.0.0/13',
4367 'SB': '202.1.160.0/19',
4368 'SC': '154.192.0.0/11',
4369 'SD': '102.120.0.0/13',
4370 'SE': '78.64.0.0/12',
4371 'SG': '8.128.0.0/10',
4372 'SI': '188.196.0.0/14',
4373 'SK': '78.98.0.0/15',
4374 'SL': '102.143.0.0/17',
4375 'SM': '89.186.32.0/19',
4376 'SN': '41.82.0.0/15',
4377 'SO': '154.115.192.0/18',
4378 'SR': '186.179.128.0/17',
4379 'SS': '105.235.208.0/21',
4380 'ST': '197.159.160.0/19',
4381 'SV': '168.243.0.0/16',
4382 'SX': '190.102.0.0/20',
4383 'SY': '5.0.0.0/16',
4384 'SZ': '41.84.224.0/19',
4385 'TC': '65.255.48.0/20',
4386 'TD': '154.68.128.0/19',
4387 'TG': '196.168.0.0/14',
4388 'TH': '171.96.0.0/13',
4389 'TJ': '85.9.128.0/18',
4390 'TK': '27.96.24.0/21',
4391 'TL': '180.189.160.0/20',
4392 'TM': '95.85.96.0/19',
4393 'TN': '197.0.0.0/11',
4394 'TO': '175.176.144.0/21',
4395 'TR': '78.160.0.0/11',
4396 'TT': '186.44.0.0/15',
4397 'TV': '202.2.96.0/19',
4398 'TW': '120.96.0.0/11',
4399 'TZ': '156.156.0.0/14',
4400 'UA': '37.52.0.0/14',
4401 'UG': '102.80.0.0/13',
4402 'US': '6.0.0.0/8',
4403 'UY': '167.56.0.0/13',
4404 'UZ': '84.54.64.0/18',
4405 'VA': '212.77.0.0/19',
4406 'VC': '207.191.240.0/21',
4407 'VE': '186.88.0.0/13',
4408 'VG': '66.81.192.0/20',
4409 'VI': '146.226.0.0/16',
4410 'VN': '14.160.0.0/11',
4411 'VU': '202.80.32.0/20',
4412 'WF': '117.20.32.0/21',
4413 'WS': '202.4.32.0/19',
4414 'YE': '134.35.0.0/16',
4415 'YT': '41.242.116.0/22',
4416 'ZA': '41.0.0.0/11',
4417 'ZM': '102.144.0.0/13',
4418 'ZW': '102.177.192.0/18',
4419 }
4420
4421 @classmethod
4422 def random_ipv4(cls, code_or_block):
4423 if len(code_or_block) == 2:
4424 block = cls._country_ip_map.get(code_or_block.upper())
4425 if not block:
4426 return None
4427 else:
4428 block = code_or_block
4429 addr, preflen = block.split('/')
4430 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4431 addr_max = addr_min | (0xffffffff >> int(preflen))
4432 return compat_str(socket.inet_ntoa(
4433 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4434
4435
4436 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4437 def __init__(self, proxies=None):
4438 # Set default handlers
4439 for type in ('http', 'https'):
4440 setattr(self, '%s_open' % type,
4441 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4442 meth(r, proxy, type))
4443 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4444
4445 def proxy_open(self, req, proxy, type):
4446 req_proxy = req.headers.get('Ytdl-request-proxy')
4447 if req_proxy is not None:
4448 proxy = req_proxy
4449 del req.headers['Ytdl-request-proxy']
4450
4451 if proxy == '__noproxy__':
4452 return None # No Proxy
4453 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4454 req.add_header('Ytdl-socks-proxy', proxy)
4455 # yt-dlp's http/https handlers do wrapping the socket with socks
4456 return None
4457 return compat_urllib_request.ProxyHandler.proxy_open(
4458 self, req, proxy, type)
4459
4460
4461 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4462 # released into Public Domain
4463 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4464
4465 def long_to_bytes(n, blocksize=0):
4466 """long_to_bytes(n:long, blocksize:int) : string
4467 Convert a long integer to a byte string.
4468
4469 If optional blocksize is given and greater than zero, pad the front of the
4470 byte string with binary zeros so that the length is a multiple of
4471 blocksize.
4472 """
4473 # after much testing, this algorithm was deemed to be the fastest
4474 s = b''
4475 n = int(n)
4476 while n > 0:
4477 s = compat_struct_pack('>I', n & 0xffffffff) + s
4478 n = n >> 32
4479 # strip off leading zeros
4480 for i in range(len(s)):
4481 if s[i] != b'\000'[0]:
4482 break
4483 else:
4484 # only happens when n == 0
4485 s = b'\000'
4486 i = 0
4487 s = s[i:]
4488 # add back some pad bytes. this could be done more efficiently w.r.t. the
4489 # de-padding being done above, but sigh...
4490 if blocksize > 0 and len(s) % blocksize:
4491 s = (blocksize - len(s) % blocksize) * b'\000' + s
4492 return s
4493
4494
4495 def bytes_to_long(s):
4496 """bytes_to_long(string) : long
4497 Convert a byte string to a long integer.
4498
4499 This is (essentially) the inverse of long_to_bytes().
4500 """
4501 acc = 0
4502 length = len(s)
4503 if length % 4:
4504 extra = (4 - length % 4)
4505 s = b'\000' * extra + s
4506 length = length + extra
4507 for i in range(0, length, 4):
4508 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4509 return acc
4510
4511
4512 def ohdave_rsa_encrypt(data, exponent, modulus):
4513 '''
4514 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4515
4516 Input:
4517 data: data to encrypt, bytes-like object
4518 exponent, modulus: parameter e and N of RSA algorithm, both integer
4519 Output: hex string of encrypted data
4520
4521 Limitation: supports one block encryption only
4522 '''
4523
4524 payload = int(binascii.hexlify(data[::-1]), 16)
4525 encrypted = pow(payload, exponent, modulus)
4526 return '%x' % encrypted
4527
4528
4529 def pkcs1pad(data, length):
4530 """
4531 Padding input data with PKCS#1 scheme
4532
4533 @param {int[]} data input data
4534 @param {int} length target length
4535 @returns {int[]} padded data
4536 """
4537 if len(data) > length - 11:
4538 raise ValueError('Input data too long for PKCS#1 padding')
4539
4540 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4541 return [0, 2] + pseudo_random + [0] + data
4542
4543
4544 def encode_base_n(num, n, table=None):
4545 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4546 if not table:
4547 table = FULL_TABLE[:n]
4548
4549 if n > len(table):
4550 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4551
4552 if num == 0:
4553 return table[0]
4554
4555 ret = ''
4556 while num:
4557 ret = table[num % n] + ret
4558 num = num // n
4559 return ret
4560
4561
4562 def decode_packed_codes(code):
4563 mobj = re.search(PACKED_CODES_RE, code)
4564 obfuscated_code, base, count, symbols = mobj.groups()
4565 base = int(base)
4566 count = int(count)
4567 symbols = symbols.split('|')
4568 symbol_table = {}
4569
4570 while count:
4571 count -= 1
4572 base_n_count = encode_base_n(count, base)
4573 symbol_table[base_n_count] = symbols[count] or base_n_count
4574
4575 return re.sub(
4576 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4577 obfuscated_code)
4578
4579
4580 def caesar(s, alphabet, shift):
4581 if shift == 0:
4582 return s
4583 l = len(alphabet)
4584 return ''.join(
4585 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4586 for c in s)
4587
4588
4589 def rot47(s):
4590 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4591
4592
4593 def parse_m3u8_attributes(attrib):
4594 info = {}
4595 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4596 if val.startswith('"'):
4597 val = val[1:-1]
4598 info[key] = val
4599 return info
4600
4601
4602 def urshift(val, n):
4603 return val >> n if val >= 0 else (val + 0x100000000) >> n
4604
4605
4606 # Based on png2str() written by @gdkchan and improved by @yokrysty
4607 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4608 def decode_png(png_data):
4609 # Reference: https://www.w3.org/TR/PNG/
4610 header = png_data[8:]
4611
4612 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4613 raise OSError('Not a valid PNG file.')
4614
4615 int_map = {1: '>B', 2: '>H', 4: '>I'}
4616 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4617
4618 chunks = []
4619
4620 while header:
4621 length = unpack_integer(header[:4])
4622 header = header[4:]
4623
4624 chunk_type = header[:4]
4625 header = header[4:]
4626
4627 chunk_data = header[:length]
4628 header = header[length:]
4629
4630 header = header[4:] # Skip CRC
4631
4632 chunks.append({
4633 'type': chunk_type,
4634 'length': length,
4635 'data': chunk_data
4636 })
4637
4638 ihdr = chunks[0]['data']
4639
4640 width = unpack_integer(ihdr[:4])
4641 height = unpack_integer(ihdr[4:8])
4642
4643 idat = b''
4644
4645 for chunk in chunks:
4646 if chunk['type'] == b'IDAT':
4647 idat += chunk['data']
4648
4649 if not idat:
4650 raise OSError('Unable to read PNG data.')
4651
4652 decompressed_data = bytearray(zlib.decompress(idat))
4653
4654 stride = width * 3
4655 pixels = []
4656
4657 def _get_pixel(idx):
4658 x = idx % stride
4659 y = idx // stride
4660 return pixels[y][x]
4661
4662 for y in range(height):
4663 basePos = y * (1 + stride)
4664 filter_type = decompressed_data[basePos]
4665
4666 current_row = []
4667
4668 pixels.append(current_row)
4669
4670 for x in range(stride):
4671 color = decompressed_data[1 + basePos + x]
4672 basex = y * stride + x
4673 left = 0
4674 up = 0
4675
4676 if x > 2:
4677 left = _get_pixel(basex - 3)
4678 if y > 0:
4679 up = _get_pixel(basex - stride)
4680
4681 if filter_type == 1: # Sub
4682 color = (color + left) & 0xff
4683 elif filter_type == 2: # Up
4684 color = (color + up) & 0xff
4685 elif filter_type == 3: # Average
4686 color = (color + ((left + up) >> 1)) & 0xff
4687 elif filter_type == 4: # Paeth
4688 a = left
4689 b = up
4690 c = 0
4691
4692 if x > 2 and y > 0:
4693 c = _get_pixel(basex - stride - 3)
4694
4695 p = a + b - c
4696
4697 pa = abs(p - a)
4698 pb = abs(p - b)
4699 pc = abs(p - c)
4700
4701 if pa <= pb and pa <= pc:
4702 color = (color + a) & 0xff
4703 elif pb <= pc:
4704 color = (color + b) & 0xff
4705 else:
4706 color = (color + c) & 0xff
4707
4708 current_row.append(color)
4709
4710 return width, height, pixels
4711
4712
4713 def write_xattr(path, key, value):
4714 # Windows: Write xattrs to NTFS Alternate Data Streams:
4715 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4716 if compat_os_name == 'nt':
4717 assert ':' not in key
4718 assert os.path.exists(path)
4719
4720 try:
4721 with open(f'{path}:{key}', 'wb') as f:
4722 f.write(value)
4723 except OSError as e:
4724 raise XAttrMetadataError(e.errno, e.strerror)
4725 return
4726
4727 # UNIX Method 1. Use xattrs/pyxattrs modules
4728 from .dependencies import xattr
4729
4730 setxattr = None
4731 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4732 # Unicode arguments are not supported in pyxattr until version 0.5.0
4733 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4734 if version_tuple(xattr.__version__) >= (0, 5, 0):
4735 setxattr = xattr.set
4736 elif xattr:
4737 setxattr = xattr.setxattr
4738
4739 if setxattr:
4740 try:
4741 setxattr(path, key, value)
4742 except OSError as e:
4743 raise XAttrMetadataError(e.errno, e.strerror)
4744 return
4745
4746 # UNIX Method 2. Use setfattr/xattr executables
4747 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4748 else 'xattr' if check_executable('xattr', ['-h']) else None)
4749 if not exe:
4750 raise XAttrUnavailableError(
4751 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4752 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4753
4754 value = value.decode()
4755 try:
4756 p = Popen(
4757 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4758 stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4759 except OSError as e:
4760 raise XAttrMetadataError(e.errno, e.strerror)
4761 stderr = p.communicate_or_kill()[1].decode('utf-8', 'replace')
4762 if p.returncode:
4763 raise XAttrMetadataError(p.returncode, stderr)
4764
4765
4766 def random_birthday(year_field, month_field, day_field):
4767 start_date = datetime.date(1950, 1, 1)
4768 end_date = datetime.date(1995, 12, 31)
4769 offset = random.randint(0, (end_date - start_date).days)
4770 random_date = start_date + datetime.timedelta(offset)
4771 return {
4772 year_field: str(random_date.year),
4773 month_field: str(random_date.month),
4774 day_field: str(random_date.day),
4775 }
4776
4777
4778 # Templates for internet shortcut files, which are plain text files.
4779 DOT_URL_LINK_TEMPLATE = '''\
4780 [InternetShortcut]
4781 URL=%(url)s
4782 '''
4783
4784 DOT_WEBLOC_LINK_TEMPLATE = '''\
4785 <?xml version="1.0" encoding="UTF-8"?>
4786 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4787 <plist version="1.0">
4788 <dict>
4789 \t<key>URL</key>
4790 \t<string>%(url)s</string>
4791 </dict>
4792 </plist>
4793 '''
4794
4795 DOT_DESKTOP_LINK_TEMPLATE = '''\
4796 [Desktop Entry]
4797 Encoding=UTF-8
4798 Name=%(filename)s
4799 Type=Link
4800 URL=%(url)s
4801 Icon=text-html
4802 '''
4803
4804 LINK_TEMPLATES = {
4805 'url': DOT_URL_LINK_TEMPLATE,
4806 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4807 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4808 }
4809
4810
4811 def iri_to_uri(iri):
4812 """
4813 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4814
4815 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4816 """
4817
4818 iri_parts = compat_urllib_parse_urlparse(iri)
4819
4820 if '[' in iri_parts.netloc:
4821 raise ValueError('IPv6 URIs are not, yet, supported.')
4822 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4823
4824 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4825
4826 net_location = ''
4827 if iri_parts.username:
4828 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
4829 if iri_parts.password is not None:
4830 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
4831 net_location += '@'
4832
4833 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
4834 # The 'idna' encoding produces ASCII text.
4835 if iri_parts.port is not None and iri_parts.port != 80:
4836 net_location += ':' + str(iri_parts.port)
4837
4838 return urllib.parse.urlunparse(
4839 (iri_parts.scheme,
4840 net_location,
4841
4842 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4843
4844 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4845 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4846
4847 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4848 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4849
4850 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4851
4852 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4853
4854
4855 def to_high_limit_path(path):
4856 if sys.platform in ['win32', 'cygwin']:
4857 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4858 return '\\\\?\\' + os.path.abspath(path)
4859
4860 return path
4861
4862
4863 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4864 val = traverse_obj(obj, *variadic(field))
4865 if val in ignore:
4866 return default
4867 return template % (func(val) if func else val)
4868
4869
4870 def clean_podcast_url(url):
4871 return re.sub(r'''(?x)
4872 (?:
4873 (?:
4874 chtbl\.com/track|
4875 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4876 play\.podtrac\.com
4877 )/[^/]+|
4878 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4879 flex\.acast\.com|
4880 pd(?:
4881 cn\.co| # https://podcorn.com/analytics-prefix/
4882 st\.fm # https://podsights.com/docs/
4883 )/e
4884 )/''', '', url)
4885
4886
4887 _HEX_TABLE = '0123456789abcdef'
4888
4889
4890 def random_uuidv4():
4891 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4892
4893
4894 def make_dir(path, to_screen=None):
4895 try:
4896 dn = os.path.dirname(path)
4897 if dn and not os.path.exists(dn):
4898 os.makedirs(dn)
4899 return True
4900 except OSError as err:
4901 if callable(to_screen) is not None:
4902 to_screen('unable to create directory ' + error_to_compat_str(err))
4903 return False
4904
4905
4906 def get_executable_path():
4907 from .update import get_variant_and_executable_path
4908
4909 return os.path.abspath(get_variant_and_executable_path()[1])
4910
4911
4912 def load_plugins(name, suffix, namespace):
4913 classes = {}
4914 with contextlib.suppress(FileNotFoundError):
4915 plugins_spec = importlib.util.spec_from_file_location(
4916 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
4917 plugins = importlib.util.module_from_spec(plugins_spec)
4918 sys.modules[plugins_spec.name] = plugins
4919 plugins_spec.loader.exec_module(plugins)
4920 for name in dir(plugins):
4921 if name in namespace:
4922 continue
4923 if not name.endswith(suffix):
4924 continue
4925 klass = getattr(plugins, name)
4926 classes[name] = namespace[name] = klass
4927 return classes
4928
4929
4930 def traverse_obj(
4931 obj, *path_list, default=None, expected_type=None, get_all=True,
4932 casesense=True, is_user_input=False, traverse_string=False):
4933 ''' Traverse nested list/dict/tuple
4934 @param path_list A list of paths which are checked one by one.
4935 Each path is a list of keys where each key is a:
4936 - None: Do nothing
4937 - string: A dictionary key
4938 - int: An index into a list
4939 - tuple: A list of keys all of which will be traversed
4940 - Ellipsis: Fetch all values in the object
4941 - Function: Takes the key and value as arguments
4942 and returns whether the key matches or not
4943 @param default Default value to return
4944 @param expected_type Only accept final value of this type (Can also be any callable)
4945 @param get_all Return all the values obtained from a path or only the first one
4946 @param casesense Whether to consider dictionary keys as case sensitive
4947 @param is_user_input Whether the keys are generated from user input. If True,
4948 strings are converted to int/slice if necessary
4949 @param traverse_string Whether to traverse inside strings. If True, any
4950 non-compatible object will also be converted into a string
4951 # TODO: Write tests
4952 '''
4953 if not casesense:
4954 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
4955 path_list = (map(_lower, variadic(path)) for path in path_list)
4956
4957 def _traverse_obj(obj, path, _current_depth=0):
4958 nonlocal depth
4959 path = tuple(variadic(path))
4960 for i, key in enumerate(path):
4961 if None in (key, obj):
4962 return obj
4963 if isinstance(key, (list, tuple)):
4964 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
4965 key = ...
4966 if key is ...:
4967 obj = (obj.values() if isinstance(obj, dict)
4968 else obj if isinstance(obj, (list, tuple, LazyList))
4969 else str(obj) if traverse_string else [])
4970 _current_depth += 1
4971 depth = max(depth, _current_depth)
4972 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
4973 elif callable(key):
4974 if isinstance(obj, (list, tuple, LazyList)):
4975 obj = enumerate(obj)
4976 elif isinstance(obj, dict):
4977 obj = obj.items()
4978 else:
4979 if not traverse_string:
4980 return None
4981 obj = str(obj)
4982 _current_depth += 1
4983 depth = max(depth, _current_depth)
4984 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
4985 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
4986 obj = (obj.get(key) if casesense or (key in obj)
4987 else next((v for k, v in obj.items() if _lower(k) == key), None))
4988 else:
4989 if is_user_input:
4990 key = (int_or_none(key) if ':' not in key
4991 else slice(*map(int_or_none, key.split(':'))))
4992 if key == slice(None):
4993 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
4994 if not isinstance(key, (int, slice)):
4995 return None
4996 if not isinstance(obj, (list, tuple, LazyList)):
4997 if not traverse_string:
4998 return None
4999 obj = str(obj)
5000 try:
5001 obj = obj[key]
5002 except IndexError:
5003 return None
5004 return obj
5005
5006 if isinstance(expected_type, type):
5007 type_test = lambda val: val if isinstance(val, expected_type) else None
5008 elif expected_type is not None:
5009 type_test = expected_type
5010 else:
5011 type_test = lambda val: val
5012
5013 for path in path_list:
5014 depth = 0
5015 val = _traverse_obj(obj, path)
5016 if val is not None:
5017 if depth:
5018 for _ in range(depth - 1):
5019 val = itertools.chain.from_iterable(v for v in val if v is not None)
5020 val = [v for v in map(type_test, val) if v is not None]
5021 if val:
5022 return val if get_all else val[0]
5023 else:
5024 val = type_test(val)
5025 if val is not None:
5026 return val
5027 return default
5028
5029
5030 def traverse_dict(dictn, keys, casesense=True):
5031 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5032 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5033 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5034
5035
5036 def get_first(obj, keys, **kwargs):
5037 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5038
5039
5040 def variadic(x, allowed_types=(str, bytes, dict)):
5041 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5042
5043
5044 def decode_base(value, digits):
5045 # This will convert given base-x string to scalar (long or int)
5046 table = {char: index for index, char in enumerate(digits)}
5047 result = 0
5048 base = len(digits)
5049 for chr in value:
5050 result *= base
5051 result += table[chr]
5052 return result
5053
5054
5055 def time_seconds(**kwargs):
5056 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5057 return t.timestamp()
5058
5059
5060 # create a JSON Web Signature (jws) with HS256 algorithm
5061 # the resulting format is in JWS Compact Serialization
5062 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5063 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5064 def jwt_encode_hs256(payload_data, key, headers={}):
5065 header_data = {
5066 'alg': 'HS256',
5067 'typ': 'JWT',
5068 }
5069 if headers:
5070 header_data.update(headers)
5071 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5072 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5073 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5074 signature_b64 = base64.b64encode(h.digest())
5075 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5076 return token
5077
5078
5079 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5080 def jwt_decode_hs256(jwt):
5081 header_b64, payload_b64, signature_b64 = jwt.split('.')
5082 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5083 return payload_data
5084
5085
5086 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5087
5088
5089 @functools.cache
5090 def supports_terminal_sequences(stream):
5091 if compat_os_name == 'nt':
5092 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5093 return False
5094 elif not os.getenv('TERM'):
5095 return False
5096 try:
5097 return stream.isatty()
5098 except BaseException:
5099 return False
5100
5101
5102 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5103 if compat_os_name != 'nt':
5104 return
5105 global WINDOWS_VT_MODE
5106 startupinfo = subprocess.STARTUPINFO()
5107 startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
5108 try:
5109 subprocess.Popen('', shell=True, startupinfo=startupinfo).wait()
5110 except Exception:
5111 return
5112
5113 WINDOWS_VT_MODE = True
5114 supports_terminal_sequences.cache_clear()
5115
5116
5117 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5118
5119
5120 def remove_terminal_sequences(string):
5121 return _terminal_sequences_re.sub('', string)
5122
5123
5124 def number_of_digits(number):
5125 return len('%d' % number)
5126
5127
5128 def join_nonempty(*values, delim='-', from_dict=None):
5129 if from_dict is not None:
5130 values = map(from_dict.get, values)
5131 return delim.join(map(str, filter(None, values)))
5132
5133
5134 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5135 """
5136 Find the largest format dimensions in terms of video width and, for each thumbnail:
5137 * Modify the URL: Match the width with the provided regex and replace with the former width
5138 * Update dimensions
5139
5140 This function is useful with video services that scale the provided thumbnails on demand
5141 """
5142 _keys = ('width', 'height')
5143 max_dimensions = max(
5144 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5145 default=(0, 0))
5146 if not max_dimensions[0]:
5147 return thumbnails
5148 return [
5149 merge_dicts(
5150 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5151 dict(zip(_keys, max_dimensions)), thumbnail)
5152 for thumbnail in thumbnails
5153 ]
5154
5155
5156 def parse_http_range(range):
5157 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5158 if not range:
5159 return None, None, None
5160 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5161 if not crg:
5162 return None, None, None
5163 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5164
5165
5166 class Config:
5167 own_args = None
5168 parsed_args = None
5169 filename = None
5170 __initialized = False
5171
5172 def __init__(self, parser, label=None):
5173 self.parser, self.label = parser, label
5174 self._loaded_paths, self.configs = set(), []
5175
5176 def init(self, args=None, filename=None):
5177 assert not self.__initialized
5178 directory = ''
5179 if filename:
5180 location = os.path.realpath(filename)
5181 directory = os.path.dirname(location)
5182 if location in self._loaded_paths:
5183 return False
5184 self._loaded_paths.add(location)
5185
5186 self.own_args, self.__initialized = args, True
5187 opts, _ = self.parser.parse_known_args(args)
5188 self.parsed_args, self.filename = args, filename
5189
5190 for location in opts.config_locations or []:
5191 location = os.path.join(directory, expand_path(location))
5192 if os.path.isdir(location):
5193 location = os.path.join(location, 'yt-dlp.conf')
5194 if not os.path.exists(location):
5195 self.parser.error(f'config location {location} does not exist')
5196 self.append_config(self.read_file(location), location)
5197 return True
5198
5199 def __str__(self):
5200 label = join_nonempty(
5201 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5202 delim=' ')
5203 return join_nonempty(
5204 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5205 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5206 delim='\n')
5207
5208 @staticmethod
5209 def read_file(filename, default=[]):
5210 try:
5211 optionf = open(filename)
5212 except OSError:
5213 return default # silently skip if file is not present
5214 try:
5215 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5216 contents = optionf.read()
5217 res = shlex.split(contents, comments=True)
5218 finally:
5219 optionf.close()
5220 return res
5221
5222 @staticmethod
5223 def hide_login_info(opts):
5224 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5225 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5226
5227 def _scrub_eq(o):
5228 m = eqre.match(o)
5229 if m:
5230 return m.group('key') + '=PRIVATE'
5231 else:
5232 return o
5233
5234 opts = list(map(_scrub_eq, opts))
5235 for idx, opt in enumerate(opts):
5236 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5237 opts[idx + 1] = 'PRIVATE'
5238 return opts
5239
5240 def append_config(self, *args, label=None):
5241 config = type(self)(self.parser, label)
5242 config._loaded_paths = self._loaded_paths
5243 if config.init(*args):
5244 self.configs.append(config)
5245
5246 @property
5247 def all_args(self):
5248 for config in reversed(self.configs):
5249 yield from config.all_args
5250 yield from self.parsed_args or []
5251
5252 def parse_known_args(self, **kwargs):
5253 return self.parser.parse_known_args(self.all_args, **kwargs)
5254
5255 def parse_args(self):
5256 return self.parser.parse_args(self.all_args)
5257
5258
5259 class WebSocketsWrapper():
5260 """Wraps websockets module to use in non-async scopes"""
5261 pool = None
5262
5263 def __init__(self, url, headers=None, connect=True):
5264 self.loop = asyncio.new_event_loop()
5265 # XXX: "loop" is deprecated
5266 self.conn = websockets.connect(
5267 url, extra_headers=headers, ping_interval=None,
5268 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5269 if connect:
5270 self.__enter__()
5271 atexit.register(self.__exit__, None, None, None)
5272
5273 def __enter__(self):
5274 if not self.pool:
5275 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5276 return self
5277
5278 def send(self, *args):
5279 self.run_with_loop(self.pool.send(*args), self.loop)
5280
5281 def recv(self, *args):
5282 return self.run_with_loop(self.pool.recv(*args), self.loop)
5283
5284 def __exit__(self, type, value, traceback):
5285 try:
5286 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5287 finally:
5288 self.loop.close()
5289 self._cancel_all_tasks(self.loop)
5290
5291 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5292 # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
5293 @staticmethod
5294 def run_with_loop(main, loop):
5295 if not asyncio.iscoroutine(main):
5296 raise ValueError(f'a coroutine was expected, got {main!r}')
5297
5298 try:
5299 return loop.run_until_complete(main)
5300 finally:
5301 loop.run_until_complete(loop.shutdown_asyncgens())
5302 if hasattr(loop, 'shutdown_default_executor'):
5303 loop.run_until_complete(loop.shutdown_default_executor())
5304
5305 @staticmethod
5306 def _cancel_all_tasks(loop):
5307 to_cancel = asyncio.all_tasks(loop)
5308
5309 if not to_cancel:
5310 return
5311
5312 for task in to_cancel:
5313 task.cancel()
5314
5315 # XXX: "loop" is removed in python 3.10+
5316 loop.run_until_complete(
5317 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5318
5319 for task in to_cancel:
5320 if task.cancelled():
5321 continue
5322 if task.exception() is not None:
5323 loop.call_exception_handler({
5324 'message': 'unhandled exception during asyncio.run() shutdown',
5325 'exception': task.exception(),
5326 'task': task,
5327 })
5328
5329
5330 def merge_headers(*dicts):
5331 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5332 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5333
5334
5335 class classproperty:
5336 """classmethod(property(func)) that works in py < 3.9"""
5337
5338 def __init__(self, func):
5339 functools.update_wrapper(self, func)
5340 self.func = func
5341
5342 def __get__(self, _, cls):
5343 return self.func(cls)
5344
5345
5346 class Namespace:
5347 """Immutable namespace"""
5348
5349 def __init__(self, **kwargs):
5350 self._dict = kwargs
5351
5352 def __getattr__(self, attr):
5353 return self._dict[attr]
5354
5355 def __contains__(self, item):
5356 return item in self._dict.values()
5357
5358 def __iter__(self):
5359 return iter(self._dict.items())
5360
5361 def __repr__(self):
5362 return f'{type(self).__name__}({", ".join(f"{k}={v}" for k, v in self)})'
5363
5364
5365 # Deprecated
5366 has_certifi = bool(certifi)
5367 has_websockets = bool(websockets)