]> jfr.im git - yt-dlp.git/blob - yt_dlp/utils.py
[cleanup] Misc fixes
[yt-dlp.git] / yt_dlp / utils.py
1 #!/usr/bin/env python3
2 import atexit
3 import base64
4 import binascii
5 import calendar
6 import codecs
7 import collections
8 import contextlib
9 import ctypes
10 import datetime
11 import email.header
12 import email.utils
13 import errno
14 import gzip
15 import hashlib
16 import hmac
17 import importlib.util
18 import io
19 import itertools
20 import json
21 import locale
22 import math
23 import mimetypes
24 import operator
25 import os
26 import platform
27 import random
28 import re
29 import shlex
30 import socket
31 import ssl
32 import subprocess
33 import sys
34 import tempfile
35 import time
36 import traceback
37 import urllib.parse
38 import xml.etree.ElementTree
39 import zlib
40
41 from .compat import asyncio, functools # isort: split
42 from .compat import (
43 compat_chr,
44 compat_cookiejar,
45 compat_etree_fromstring,
46 compat_expanduser,
47 compat_html_entities,
48 compat_html_entities_html5,
49 compat_HTMLParseError,
50 compat_HTMLParser,
51 compat_http_client,
52 compat_HTTPError,
53 compat_os_name,
54 compat_parse_qs,
55 compat_shlex_quote,
56 compat_str,
57 compat_struct_pack,
58 compat_struct_unpack,
59 compat_urllib_error,
60 compat_urllib_parse_unquote_plus,
61 compat_urllib_parse_urlencode,
62 compat_urllib_parse_urlparse,
63 compat_urllib_request,
64 compat_urlparse,
65 )
66 from .dependencies import brotli, certifi, websockets
67 from .socks import ProxyType, sockssocket
68
69
def register_socks_protocols():
    """Make urllib's URL parsing treat SOCKS proxy schemes as netloc-aware.

    urlsplit() only parses the netloc part for schemes listed in
    urlparse.uses_netloc (see https://bugs.python.org/issue7904), so the
    SOCKS schemes must be registered there explicitly.
    """
    missing = (scheme for scheme in ('socks', 'socks4', 'socks4a', 'socks5')
               if scheme not in compat_urlparse.uses_netloc)
    compat_urlparse.uses_netloc.extend(missing)
77
78
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))  # the type of compiled regex objects (re.Pattern)
81
82
def random_user_agent():
    """Return a plausible Chrome-on-Windows User-Agent string with a randomly
    chosen Chrome version, to avoid all requests carrying an identical UA."""
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    # Real Chrome release versions substituted into the template above
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
126
127
# Content-Encoding values advertised via the Accept-Encoding request header
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    # Brotli is only advertised when the optional dependency is available
    SUPPORTED_ENCODINGS.append('br')

# Default HTTP headers sent with every request (individual requests may override)
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
140
141
# Alternative User-Agent strings that callers can opt into by name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
145
146
# Sentinel meaning "no default was supplied" (lets None be a valid default value)
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code, used when parsing free-form dates
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
159
# File extensions recognised as media container/codec files
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# (maps each accented character to its ASCII transliteration)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
179
# strptime() formats tried (in order) when parsing free-form date strings
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Variant preferring day-first (European-style) ambiguous dates
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Variant preferring month-first (US-style) ambiguous dates
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of P.A.C.K.E.R.-obfuscated JavaScript
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches <script type="application/ld+json"> blocks (JSON-LD metadata)
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

# A (possibly fractional) unsigned decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
248
249
@functools.cache
def preferredencoding():
    """Best guess at the system's preferred text encoding.

    Uses locale.getpreferredencoding(), but falls back to UTF-8 when the
    reported encoding is unusable (i.e. cannot encode plain ASCII text).
    """
    encoding = 'UTF-8'
    try:
        candidate = locale.getpreferredencoding()
        'TEST'.encode(candidate)
    except Exception:
        pass  # broken/unknown locale: keep the UTF-8 fallback
    else:
        encoding = candidate
    return encoding
264
265
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # Write to a temporary file in the destination directory first, then
    # rename it over the target so readers never see a half-written file
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile is created with restrictive 0o600 permissions;
            # re-apply the usual umask-derived mode so the result matches open()
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file; re-raise the original error
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
290
291
def find_xpath_attr(node, xpath, key, val=None):
    """Find the first node matching xpath[@key] (or xpath[@key='val'] if val is given)."""
    assert re.match(r'^[a-zA-Z_-]+$', key)
    predicate = '[@%s]' % key if val is None else f"[@{key}='{val}']"
    return node.find(xpath + predicate)
297
# Historical note: on very old Python versions the xml.etree.ElementTree.Element
# methods did not support the namespace parameter, hence the helper below
300
301
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an xpath into '{uri}tag' form using ns_map."""
    def expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            return parts[0]  # no namespace prefix on this step
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(component) for component in path.split('/'))
312
313
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching `xpath` (a single expression, or an
    iterable of expressions tried in order).

    Returns `default` when given and nothing matches; raises ExtractorError
    when `fatal` and no default is given; otherwise returns None.
    """
    candidates = [xpath] if isinstance(xpath, (str, compat_str)) else xpath
    n = None
    for xp in candidates:
        n = node.find(xp)
        if n is not None:
            break
    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML element %s' % (xpath if name is None else name))
    return None
335
336
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's .text."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    # Element matched but has no text: apply the same default/fatal policy
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
350
351
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the first node matching xpath[@key],
    honouring the usual default/fatal policy when nothing matches."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (f'{xpath}[@{key}]' if name is None else name))
    return None
363
364
def get_element_by_id(id, html, **kwargs):
    """Content of the first tag carrying the given id attribute, or None."""
    return get_element_by_attribute('id', id, html, **kwargs)
368
369
def get_element_html_by_id(id, html, **kwargs):
    """Whole HTML of the first tag carrying the given id attribute, or None."""
    return get_element_html_by_attribute('id', id, html, **kwargs)
373
374
def get_element_by_class(class_name, html):
    """Content of the first tag whose class attribute contains class_name, or None."""
    return next(iter(get_elements_by_class(class_name, html) or []), None)
379
380
def get_element_html_by_class(class_name, html):
    """Whole HTML of the first tag whose class attribute contains class_name, or None."""
    return next(iter(get_elements_html_by_class(class_name, html) or []), None)
385
386
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Content of the first tag with the given attribute/value, or None."""
    return next(iter(get_elements_by_attribute(attribute, value, html, **kwargs) or []), None)
390
391
def get_element_html_by_attribute(attribute, value, html, **kwargs):
    """Whole HTML of the first tag with the given attribute/value, or None.

    Note: the kwargs parameter was renamed from the inconsistent `kargs`
    to match every sibling helper; the name is internal, so callers are
    unaffected.
    """
    retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None
395
396
def get_elements_by_class(class_name, html, **kargs):
    """Content of every tag whose class attribute contains class_name.

    NOTE(review): **kargs is accepted but not forwarded — confirm no caller
    relies on passing options through here.
    """
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
402
403
def get_elements_html_by_class(class_name, html):
    """Whole HTML of every tag whose class attribute contains class_name."""
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
409
410
def get_elements_by_attribute(*args, **kwargs):
    """Text content of every tag carrying the specified attribute/value."""
    return [text for text, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
414
415
def get_elements_html_by_attribute(*args, **kwargs):
    """Whole HTML of every tag carrying the specified attribute/value."""
    return [element for _, element in get_elements_text_and_html_by_attribute(*args, **kwargs)]
419
420
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document

    Despite the docstring wording, this is a generator that yields a
    (content, whole_element) pair for EVERY matching tag.
    """

    # Only allow the attribute value to appear unquoted in the document when
    # it contains no character that would require quoting
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Verbose-mode regex matching the opening tag up to and including the
    # target attribute; (?-x:...) embeds the value with verbose mode off so
    # its whitespace stays significant
    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # strip one level of surrounding quotes from the content, then
            # decode HTML entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
444
445
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # control-flow exception: signals "outermost tag just closed"
        pass

    def __init__(self):
        # stack of currently-open tag names
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # pop until the matching opening tag is found (tolerates unclosed
        # inner tags, as real-world HTML often has them)
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # the first tag we saw has just been closed
            raise self.HTMLBreakOnClosingTagException()
486
487
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index, but raising the supplied parse error instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # feed just the opening tag so the parser's tag stack is primed
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # feed up to each candidate closing tag until the parser signals that
        # the outermost tag actually closed (skips nested same-name tags)
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
521
522
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser that records the attributes of the first (single)
    element fed to it in the `attrs` dict."""

    def __init__(self):
        self.attrs = {}
        super().__init__()

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
532
533
class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser that collects the attribute dicts of all top-level <li>
    elements into `items` (nested lists are ignored via depth tracking)."""

    def __init__(self):
        super().__init__()
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        # record only <li> tags at the outermost nesting level
        if self._level == 0 and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
549
550
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element string into a dict.

    For example, given
        <el a="foo" B="bar" c="&98;az" d=boz empty= noval entity="&amp;" sq='"' dq="'">
    this returns
        {'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
         'empty': '', 'noval': None, 'entity': '&', 'sq': '"', 'dq': '\''}
    """
    attr_parser = HTMLAttributeParser()
    # tolerate malformed markup: return whatever was gathered before the error
    with contextlib.suppress(compat_HTMLParseError):
        attr_parser.feed(html_element)
        attr_parser.close()
    return attr_parser.attrs
570
571
def parse_list(webpage):
    """Parse a series of HTML <li> elements and return their attribute dicts."""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
579
580
def clean_html(html):
    """Clean an HTML snippet into a readable plain-text string."""
    if html is None:  # Convenience for sanitizing descriptions etc.
        return None

    # collapse whitespace, turn <br> and paragraph breaks into newlines
    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # strip remaining tags, then decode entities
    html = re.sub('<.*?>', '', html)
    return unescapeHTML(html).strip()
595
596
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            # stdout may be in text mode on Windows; switch it to binary
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # attempt 0: filename as given; attempt 1: sanitized filename
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # locking failed or is unsupported: fall back to a plain open
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # permission errors won't be fixed by renaming; give up immediately
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                # sanitizing changed nothing, so retrying cannot help
                raise
631
632
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a Unix timestamp (None on failure)."""
    parsed = email.utils.parsedate_tz(timestr)
    return None if parsed is None else email.utils.mktime_tz(parsed)
640
641
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted  Use a stricter subset of allowed characters
    @param is_id       Whether this is an ID that should be kept unchanged if possible.
                       If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Replacement characters are prefixed with NUL ('\0') markers so that
        # runs of substitutes can be collapsed and stripped afterwards
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # '?' and control characters are always removed outright
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'  # drop the markers; never return ''

    if not is_id:
        # legacy rules: collapse underscores and tidy leading/trailing chars
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
689
690
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows.

    On other platforms this is a no-op unless `force` is set, in which case
    Windows path rules are applied anyway.
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        # replace characters invalid in Windows path components (and a
        # trailing dot/space, which Windows also rejects) with '#'
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # preserve absolute paths when forcing on non-Windows
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
712
713
def sanitize_url(url):
    """Prepend 'http:' to protocol-relative URLs and fix common scheme typos."""
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return None
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    for mistake, fixup in (
            # https://github.com/ytdl-org/youtube-dl/issues/15649
            (r'^httpss://', r'https://'),
            # https://bx1.be/lives/direct-tv/
            (r'^rmtp([es]?)://', r'rtmp\1://'),
    ):
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
732
733
def extract_basic_auth(url):
    """Split userinfo out of `url`.

    Returns (url_without_credentials, authorization_header_value_or_None).
    """
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    # rebuild the netloc without the user:password@ prefix
    netloc = parts.hostname if parts.port is None else '%s:%d' % (parts.hostname, parts.port)
    clean_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = '%s:%s' % (parts.username, parts.password or '')
    auth_payload = base64.b64encode(credentials.encode())
    return clean_url, f'Basic {auth_payload.decode()}'
744
745
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request with the URL sanitized/escaped and any embedded
    userinfo converted into an Authorization header."""
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # headers may arrive positionally (data, headers) or as a keyword
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)
752
753
def expand_path(s):
    """Expand environment variables and the user home marker (~) in s."""
    expanded = compat_expanduser(s)
    return os.path.expandvars(expanded)
757
758
def orderedSet(iterable):
    """Return a list of the items of `iterable` with duplicates removed,
    keeping the first occurrence of each.

    Uses equality (not hashing) for membership, so unhashable items work too.
    """
    unique = []
    for item in iterable:
        if item not in unique:
            unique.append(item)
    return unique
766
767
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: decimal (#123) or hexadecimal (#x7B)
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base, numstr = 16, '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
795
796
def unescapeHTML(s):
    """Replace HTML entities in s with their characters (None passes through)."""
    if s is None:
        return None
    assert isinstance(s, str)
    return re.sub(r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
804
805
def escapeHTML(text):
    """Escape &, <, >, and both quote characters for safe HTML embedding."""
    # '&' must be replaced first so already-produced entities are not re-escaped
    for char, entity in (
            ('&', '&amp;'),
            ('<', '&lt;'),
            ('>', '&gt;'),
            ('"', '&quot;'),
            ("'", '&#39;')):
        text = text.replace(char, entity)
    return text
815
816
def process_communicate_or_kill(p, *args, **kwargs):
    """Deprecated alias of Popen.communicate_or_kill, kept for backward compatibility."""
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)
821
822
class Popen(subprocess.Popen):
    """subprocess.Popen that suppresses console windows on Windows and adds
    communicate_or_kill()."""
    if sys.platform == 'win32':
        # prevent a console window from flashing up for each subprocess
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        """communicate(), but kill and reap the process if interrupted."""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill()
            self.wait()
            raise
840
841
def get_subprocess_encoding():
    """Encoding used when exchanging text with subprocesses on this platform."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
852
853
def encodeFilename(s, for_subprocess=False):
    """Legacy no-op: filenames are always str on Python 3.

    @param for_subprocess  unused, retained for backward compatibility
    """
    assert isinstance(s, str)
    return s
857
858
def decodeFilename(b, for_subprocess=False):
    """Legacy no-op counterpart of encodeFilename; kept for API compatibility."""
    return b
861
862
def encodeArgument(s):
    """Return s as str, decoding legacy byte-string arguments as ASCII."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    if not isinstance(s, str):
        return s.decode('ascii')
    return s
868
869
def decodeArgument(b):
    """Legacy no-op counterpart of encodeArgument; kept for API compatibility."""
    return b
872
873
def decodeOption(optval):
    """Decode a command-line option value to str (None passes through)."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
882
883
# Named components of a duration split out of a millisecond count
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into (hours, minutes, seconds, milliseconds)."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)


def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H<delim>]MM<delim>SS (or bare seconds
    when under a minute), optionally appending '.mmm' milliseconds."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        base = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        base = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        base = '%d' % t.seconds
    if not msec:
        return base
    return '%s.%03d' % (base, t.milliseconds)
903
904
def _ssl_load_windows_store_certs(ssl_context, storename):
    """Load trusted server-auth certificates from a Windows certificate store
    ('CA' or 'ROOT') into ssl_context, skipping any that fail to parse."""
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        # store not readable: silently skip, matching CPython's behaviour
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
916
917
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose SSLContext is configured from the
    user options in `params` (certificate checking, legacy-server workaround,
    CA bundle selection and client certificates)."""
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # has_certifi is defined elsewhere in this module from the optional
        # certifi dependency
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        try:
            context.load_default_certs()
        # Work around the issue in load_default_certs when there are bad certificates. See:
        # https://github.com/yt-dlp/yt-dlp/issues/1060,
        # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
959
960
def bug_reports_message(before=';'):
    """Standard 'please report this issue' blurb, capitalised when it begins a sentence."""
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        # starting a new sentence: capitalise the first word
        msg = msg[0].title() + msg[1:]

    prefix = before + ' ' if before else ''
    return prefix + msg
971
972
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Subclasses may set a class-level default message
    msg = None

    def __init__(self, msg=None):
        # precedence: explicit argument > class-level default > class name
        if msg is None:
            msg = self.msg if self.msg is not None else type(self).__name__
        self.msg = msg
        super().__init__(self.msg)
983
984
# Exceptions that indicate a (possibly transient) network problem rather than a bug
network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    # present on all supported Python versions; checked defensively
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
989
990
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        # Errors raised while handling a network exception are always expected
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)  # message without the [ie]/video_id/cause decoration
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie  # name of the extractor that raised the error, if any
        self.exc_info = sys.exc_info()  # preserve original exception

        super().__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        """Return the formatted traceback (plus the cause's traceback), or None."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
1021
1022
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the given URL."""

    def __init__(self, url):
        self.url = url
        super().__init__('Unsupported URL: %s' % url, expected=True)
1028
1029
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # No extra behavior; exists so callers can catch this case specifically
    pass
1033
1034
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Geo-restriction is an expected condition, never a bug
        self.countries = countries
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
1046
1047
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
1060
1061
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'  # default message used by YoutubeDLError.__init__
1069
1070
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Include the offending filename in the message. The previous
            # f-string had no placeholder, so the filename was silently dropped
            # (cf. UnavailableVideoError, which appends the detail the same way).
            self.msg += f': {filename}'
        super().__init__(self.msg)
1083
1084
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    # Message defaults to the class name via YoutubeDLError.__init__
1091
1092
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'  # default message; subclasses override
1096
1097
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1101
1102
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1106
1107
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1111
1112
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        # `expected` mirrors ExtractorError's flag: True when not a bug
        self.expected = expected
        super().__init__(msg)
1119
1120
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Throttling is treated as a re-extraction trigger, not an expected error
        super().__init__(self.msg, expected=False)
1127
1128
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        # Append the underlying error detail when one is given
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
1141
1142
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both sizes are in bytes
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1156
1157
class XAttrMetadataError(YoutubeDLError):
    """Error while reading/writing extended file attributes, with a classified reason"""

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify code/message into a machine-readable reason
        if self.code in (errno.ENOSPC, errno.EDQUOT) or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg:
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1172
1173
class XAttrUnavailableError(YoutubeDLError):
    # NOTE(review): presumably raised when no xattr implementation can be used
    # on this system - confirm at the call sites (not visible in this chunk)
    pass
1176
1177
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate *http_class*, optionally binding it to the configured source address.

    ydl_handler: handler whose `_params` dict may carry 'source_address'
    http_class:  http.client connection class to instantiate
    is_https:    unused here; kept so callers can build a uniform partial()
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Keep only addresses whose family matches the source address
            # (a '.' in the source address means IPv4, otherwise IPv6)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate address until one connects
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            # Re-raise the last connection error, if any attempt was made
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1223
1224
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, any 'Accept-Encoding' header (case-insensitive)
    is also removed so the request is made without compression. The input
    mapping is never mutated; a filtered copy is returned instead.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = {k: v for k, v in headers.items() if k.lower() != 'accept-encoding'}
    del filtered['Youtubedl-no-compression']
    return filtered
1233
1234
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params  # YoutubeDL params dict (provides 'http_headers')

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Internal marker header: route this request through a SOCKS proxy
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            # First try a raw deflate stream (no zlib header)
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Fall back to a zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        # Transparently decompress the body and drop the Content-encoding
        # header so downstream consumers see plain content
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with progressively more bytes trimmed from the end
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
1363
1364
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from *base_class* that tunnels through *socks_proxy*.

    socks_proxy is a URL such as 'socks5://user:pass@host:port'.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): an unrecognized scheme falls through, leaving `socks_type`
    # unbound (NameError below) - presumably callers only pass socks* schemes;
    # confirm at the call sites

    def unquote_if_non_empty(s):
        # Credentials are percent-encoded in the proxy URL
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, wrap the raw SOCKS socket with TLS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1406
1407
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler: opens HTTPS connections with the
    configured source address, optional SOCKS proxying and a custom connection
    class."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Internal marker header: route this request through a SOCKS proxy
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            # Surface a legacy-TLS handshake failure with an actionable hint
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
1436
1437
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    A MozillaCookieJar that also accepts file-like objects, tolerates the
    '#HttpOnly_' prefix and handles session cookies stored with `expires=0`.

    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    # One tab-separated line of a Netscape cookies.txt file
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        # Accept os.PathLike in addition to plain str/bytes filenames
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        """Yield a file object for *file*, which may be a path or an open file"""
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            # Already a file-like object; clear it before writing
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            # Skip cookies that should not be persisted
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Strip the '#HttpOnly_' prefix so the line parses as a normal entry
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    # A leading '[', '{' or '"' suggests the user exported JSON
                    if f'{line.strip()} '[0] in '[{"':
                        raise compat_cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1570
1571
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the standard HTTP cookie handling to HTTPS too"""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1581
1582
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # Reuse the stock 302 handler for all other redirect status codes
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)
1643
1644
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (timezone, date_str): *timezone* is a datetime.timedelta (zero when
    no offset is present) and *date_str* has the designator removed.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
            (?P<sign>\+|-)                                       # +/-
            (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})           # hh[:]mm
            $)
        ''', date_str)
    if not m:
        return datetime.timedelta(), date_str
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # Bare 'Z' designator: UTC
        return datetime.timedelta(), date_str
    sign = 1 if m.group('sign') == '+' else -1
    return datetime.timedelta(
        hours=sign * int(m.group('hours')),
        minutes=sign * int(m.group('minutes'))), date_str
1669
1670
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # Drop fractional seconds; strptime below does not expect them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    # Returns None implicitly when the date does not match the format
    with contextlib.suppress(ValueError):
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
        return calendar.timegm(dt.timetuple())
1686
1687
def date_formats(day_first=True):
    """Return the list of supported date formats, preferring DD/MM or MM/DD order"""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1690
1691
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None
    upload_date = None
    # Replace commas, then strip AM/PM markers and timezone designators
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    _, date_str = extract_timezone(date_str)

    # NB: deliberately no `break` - when several formats match,
    # the last successful one wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822 style parsing
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
1714
1715
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp for *date_str*, or None if it cannot be parsed.

    day_first controls whether ambiguous dates are read as DD/MM or MM/DD.
    """
    if date_str is None:
        return None

    # Commas and pipes never carry meaning in a timestamp
    date_str = re.sub(r'[,|]', '', date_str)

    # Record the 12-hour offset *before* the AM/PM marker is stripped below
    # NOTE(review): this adds 12h even when the hour is already 12 ("12:30 PM");
    # whether any format in date_formats() hits that case - confirm
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # First matching format wins (unlike unified_strdate, this returns early)
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
    # Fall back to RFC 2822 style parsing
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
1745
1746
1747 def determine_ext(url, default_ext='unknown_video'):
1748 if url is None or '.' not in url:
1749 return default_ext
1750 guess = url.partition('?')[0].rpartition('.')[2]
1751 if re.match(r'^[A-Za-z0-9]+$', guess):
1752 return guess
1753 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1754 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1755 return guess.rstrip('/')
1756 else:
1757 return default_ext
1758
1759
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build the subtitle filename for the given language and subtitle format"""
    sub_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, sub_ext, expected_real_ext)
1762
1763
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is None:
        # Plain DATE with no offset suffix
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # Recursively resolve the base date, then apply the signed offset
    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    if unit in ('month', 'year'):
        # timedelta cannot represent months/years; use calendar arithmetic
        new_date = datetime_add_months(start_time, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit = 'day'
            amount *= 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})
    return datetime_round(new_date, unit) if auto_precision else new_date
1804
1805
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1816
1817
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day is clamped to the target month's length (Jan 31 + 1 month -> Feb 28/29).
    """
    total_months = dt.month - 1 + months
    year = dt.year + total_months // 12
    month = total_months % 12 + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
1825
1826
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    timestamp = calendar.timegm(dt.timetuple())
    # Round half up to the nearest multiple of the unit
    rounded = (timestamp + seconds_per_unit / 2) // seconds_per_unit * seconds_per_unit
    return datetime.datetime.utcfromtimestamp(rounded)
1843
1844
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    # Anything that is not exactly 8 digits is passed through untouched
    return '-'.join(match.groups()) if match else date_str
1853
1854
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the minimum/maximum representable date
        self.start = date_from_str(start, strict=True) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end, strict=True) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return f'{self.start.isoformat()} - {self.end.isoformat()}'
1884
1885
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Defensive: decode if the platform module ever hands back bytes
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1894
1895
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
1903
1904
def write_string(s, out=None, encoding=None):
    """Write string *s* to *out* (default: sys.stderr), encoding when needed.

    Binary streams (mode contains 'b') and text streams exposing a `.buffer`
    get the string encoded (errors ignored); other streams receive it as-is.
    """
    assert isinstance(s, str)
    out = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # NOTE(review): prepends a space before line breaks on Windows terminals
        # supporting ANSI sequences - presumably a console rendering workaround;
        # confirm the underlying issue
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: must encode before writing
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: write encoded bytes there
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1921
1922
def bytes_to_intlist(bs):
    """Convert a bytes-like (or str) sequence into a list of integer values."""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # bytes/bytearray already yield ints when indexed
        return list(bs)
    # str or similar: map each character to its code point
    return [ord(ch) for ch in bs]
1930
1931
def intlist_to_bytes(xs):
    """Pack a sequence of integers (each 0-255) into a bytes object."""
    if not xs:
        return b''
    # The builtin bytes() constructor accepts any iterable of ints in
    # range(256); it replaces the former struct.pack('%dB' % len(xs), *xs)
    return bytes(xs)
1936
1937
class LockingUnsupportedError(OSError):
    """Raised when the platform provides no usable file-locking primitive."""

    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1943
1944
# Cross-platform file locking.
# Defines _lock_file(f, exclusive, block) and _unlock_file(f):
#  - on Windows via kernel32 LockFileEx/UnlockFileEx,
#  - elsewhere via fcntl.flock (falling back to lockf),
#  - raising LockingUnsupportedError where neither exists.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # Mirrors the Win32 OVERLAPPED struct; only Offset/OffsetHigh/hEvent are set
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte-range length covering the whole file (low and high DWORD halves)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Acquire a Win32 byte-range lock over the entire file."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # keep the OVERLAPPED alive as long as the file object holds the lock
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # BlockingIOError keeps the failure type consistent with the
            # fcntl.flock behaviour on the POSIX branch below
            raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the byte-range lock taken by _lock_file."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """flock() the whole file; fall back to lockf() where flock fails."""
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # lock is held elsewhere and LOCK_NB was requested
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock using whichever primitive is available."""
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            # no locking primitive on this platform
            raise LockingUnsupportedError()

        def _unlock_file(f):
            # no locking primitive on this platform
            raise LockingUnsupportedError()
2028
2029
class locked_file:
    """File wrapper that holds an advisory lock (via _lock_file) while open.

    Supports plain 'r'/'rb'/'a'/'ab'/'w'/'wb' modes; 'r' takes a shared lock,
    all other modes an exclusive one. Usable as a context manager or through
    the open()/close() aliases.
    """

    locked = False  # whether the lock is currently held

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        # Build the os.open() flag word; platform-specific flags default to 0
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only after the lock is held, so other processes never
            # observe a half-truncated file
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        # Idempotent: only the first call actually releases the lock
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the object can be used without a with-statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate everything else (read, write, ...) to the underlying file
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
2093
2094
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' if unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
2099
2100
def shell_quote(args):
    """Return the given arguments joined into one shell-escaped string."""
    encoding = get_filesystem_encoding()

    def _as_text(arg):
        # Filenames may arrive as bytes (encoded with 'encodeFilename')
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join(compat_shlex_quote(_as_text(arg)) for arg in args)
2110
2111
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL (existing wins)
    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    smuggled = compat_urllib_parse_urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{smuggled}'
2120
2121
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url() into (url, data);
    returns (smug_url, default) when nothing was smuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    return url, json.loads(compat_parse_qs(sdata)['__youtubedl_smuggle'][0])
2129
2130
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format a number with decimal suffixes (k, M, ...; Ki, Mi, ... when
    factor=1024). Returns None for missing or negative input."""
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    suffixes = 'kMGTPEZY'
    exponent = min(int(math.log(num, factor)), len(suffixes)) if num else 0
    unit = ('', *suffixes)[exponent]
    if factor == 1024 and unit:
        # binary units: k -> Ki, M -> Mi, ...
        unit = 'Ki' if unit == 'k' else f'{unit}i'
    return fmt % (num / factor ** exponent, unit)
2143
2144
def format_bytes(bytes):
    """Human-readable binary byte count like '1.44MiB'; 'N/A' when unknown."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted if formatted else 'N/A'
2147
2148
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number> <unit>' from s using unit_table;
    returns the value in base units as int, or None when s does not match."""
    units_re = '|'.join(map(re.escape, unit_table))
    m = re.match(
        rf'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>{units_re})\b', s)
    if not m:
        return None
    # accept ',' as decimal separator
    amount = float(m.group('num').replace(',', '.'))
    return int(amount * unit_table[m.group('unit')])
2158
2159
def parse_filesize(s):
    """Parse a human-readable file size ('10.5MiB', '3 GB', ...) into bytes."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too. The table is generated instead of spelled
    # out: for each magnitude we accept e.g. KiB/KB/kB/Kb/kb and the
    # spelled-out decimal/binary names (kilobytes/kibibytes).
    _UNIT_TABLE = {'B': 1, 'b': 1, 'bytes': 1}
    _DECIMAL_NAMES = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa', 'zetta', 'yotta')
    _BINARY_STEMS = ('ki', 'me', 'gi', 'te', 'pe', 'ex', 'ze', 'yo')
    for exp, (dec, bi) in enumerate(zip(_DECIMAL_NAMES, _BINARY_STEMS), start=1):
        letter = dec[0].upper()
        _UNIT_TABLE[f'{letter}iB'] = 1024 ** exp
        _UNIT_TABLE[f'{letter}B'] = 1000 ** exp
        _UNIT_TABLE[f'{letter.lower()}B'] = 1024 ** exp
        _UNIT_TABLE[f'{letter}b'] = 1000 ** exp
        _UNIT_TABLE[f'{letter.lower()}b'] = 1000 ** exp
        _UNIT_TABLE[f'{dec}bytes'] = 1000 ** exp
        _UNIT_TABLE[f'{bi}bibytes'] = 1024 ** exp

    return lookup_unit_table(_UNIT_TABLE, s)
2229
2230
def parse_count(s):
    """Parse a human count like '15,347 views' or '1.2M' into an int."""
    if s is None:
        return None

    # drop a leading non-numeric prefix ('views: 123' -> '123')
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    parsed = lookup_unit_table(multipliers, s)
    if parsed is not None:
        return parsed

    m = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(m.group(1)) if m else None
2258
2259
def parse_resolution(s, *, lenient=False):
    """Extract video dimensions from a string: '1920x1080' -> width+height,
    '720p' -> height only, '4k'/'8k' -> height. Returns {} when nothing matches."""
    if s is None:
        return {}

    if lenient:
        dim_re = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    else:
        # require non-alphanumeric context around the WxH pair
        dim_re = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    m = re.search(dim_re, s)
    if m:
        return {
            'width': int(m.group('w')),
            'height': int(m.group('h')),
        }

    m = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if m:
        return {'height': int(m.group(1))}

    m = re.search(r'\b([48])[kK]\b', s)
    if m:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(m.group(1)) * 540}

    return {}
2283
2284
def parse_bitrate(s):
    """Parse a bitrate like '500 kbps' into an int (kbps); None otherwise."""
    # plain `str` check, consistent with parse_duration below
    # (compat_str is the Python 3 str)
    if not isinstance(s, str):
        return None
    m = re.search(r'\b(\d+)\s*kbps', s)
    if m:
        return int(m.group(1))
    return None
2291
2292
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return MONTH_NAMES.get(lang, MONTH_NAMES['en']).index(name) + 1
    except ValueError:
        return None
2302
2303
def month_by_abbreviation(abbrev):
    """Return the month number for an English three-letter abbreviation,
    or None when it is not recognized."""
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
2312
2313
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # negative lookahead keeps already-escaped entities (named or numeric) intact
    UNESCAPED_AMP_RE = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(UNESCAPED_AMP_RE, '&amp;', xml_str)
2320
2321
def setproctitle(title):
    """Set the process name shown by tools like ps (glibc-based Linux only;
    silently a no-op everywhere else)."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    # prctl expects a writable C buffer
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME (NOTE(review): per <sys/prctl.h>; confirm if libc differs)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2346
2347
def remove_start(s, start):
    """Strip `start` from the beginning of `s` if present (None-safe)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
2350
2351
def remove_end(s, end):
    """Strip `end` from the end of `s` if present (None-safe)."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
2354
2355
def remove_quotes(s):
    """Strip one matching pair of surrounding quotes (single or double)."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
2363
2364
def get_domain(url):
    """Extract 'example.com' from a URL-ish string (scheme and 'www.'
    are optional); None when no dotted domain is found."""
    m = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not m:
        return None
    return m.group('domain')
2368
2369
def url_basename(url):
    """Return the last path component of a URL: '.../a/b.mp4?x' -> 'b.mp4'."""
    # urllib.parse directly (already imported and used in this file) instead
    # of the deprecated compat_urlparse alias
    path = urllib.parse.urlparse(url).path
    return path.strip('/').split('/')[-1]
2373
2374
def base_url(url):
    """Return the URL up to and including the last '/' before any '?', '#' or '&'."""
    # NOTE(review): raises AttributeError when the URL has no path slash;
    # callers rely on getting a match
    m = re.match(r'https?://[^?#&]+/', url)
    return m.group()
2377
2378
def urljoin(base, path):
    """Join base and path like urllib.parse.urljoin, tolerating bytes/None.

    Returns None unless path is a non-empty str and base is an http(s) or
    protocol-relative URL; absolute/protocol-relative paths are returned as-is.
    (compat_str/compat_urlparse replaced with the plain str / urllib.parse
    the file already uses elsewhere.)
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
2392
2393
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
2397
2398
class PUTRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
2402
2403
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or `default` when conversion fails.
    With get_attr, the named attribute of v is converted instead."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        value = int(v)
    except (ValueError, TypeError, OverflowError):
        return default
    return value * invscale // scale
2411
2412
def str_or_none(v, default=None):
    """Coerce v to str, passing None through as `default`.
    (compat_str is the Python 3 str; use it directly for consistency.)"""
    return default if v is None else str(v)
2415
2416
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    # plain str (== compat_str on Python 3), consistent with parse_duration
    if isinstance(int_str, str):
        # strip thousands separators and '+' signs before conversion
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
2424
2425
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or `default` on None/conversion failure."""
    if v is None:
        return default
    try:
        value = float(v)
    except (ValueError, TypeError):
        return default
    return value * invscale / scale
2433
2434
def bool_or_none(v, default=None):
    """Return v only when it is an actual bool (not 0/1); else `default`."""
    if isinstance(v, bool):
        return v
    return default
2437
2438
def strip_or_none(v, default=None):
    """v.strip() when v is a str, else `default`.
    (compat_str is the Python 3 str; use it directly for consistency.)"""
    return v.strip() if isinstance(v, str) else default
2441
2442
def url_or_none(url):
    """Return url stripped when it has a supported scheme (http(s), rtmp
    variants, rtsp, mms, ftp(s)) or is protocol-relative; else None.
    (compat_str is the Python 3 str; use it directly for consistency.)"""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2448
2449
def request_to_url(req):
    """Return the URL of a urllib Request, or `req` itself when already a string."""
    if isinstance(req, compat_urllib_request.Request):
        return req.get_full_url()
    return req
2455
2456
def strftime_or_none(timestamp, date_format, default=None):
    """Format a unix timestamp (int/float, UTC) or 'YYYYMMDD' string with
    `date_format`; return `default` on any parse/format failure.
    (compat_str is the Python 3 str; use it directly for consistency.)"""
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        # AttributeError covers datetime_object still being None
        return default
2467
2468
2469 def parse_duration(s):
2470 if not isinstance(s, str):
2471 return None
2472 s = s.strip()
2473 if not s:
2474 return None
2475
2476 days, hours, mins, secs, ms = [None] * 5
2477 m = re.match(r'''(?x)
2478 (?P<before_secs>
2479 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2480 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2481 (?P<ms>[.:][0-9]+)?Z?$
2482 ''', s)
2483 if m:
2484 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2485 else:
2486 m = re.match(
2487 r'''(?ix)(?:P?
2488 (?:
2489 [0-9]+\s*y(?:ears?)?,?\s*
2490 )?
2491 (?:
2492 [0-9]+\s*m(?:onths?)?,?\s*
2493 )?
2494 (?:
2495 [0-9]+\s*w(?:eeks?)?,?\s*
2496 )?
2497 (?:
2498 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2499 )?
2500 T)?
2501 (?:
2502 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2503 )?
2504 (?:
2505 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2506 )?
2507 (?:
2508 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2509 )?Z?$''', s)
2510 if m:
2511 days, hours, mins, secs, ms = m.groups()
2512 else:
2513 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2514 if m:
2515 hours, mins = m.groups()
2516 else:
2517 return None
2518
2519 if ms:
2520 ms = ms.replace(':', '.')
2521 return sum(float(part or 0) * mult for part, mult in (
2522 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2523
2524
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the real extension: 'a.mp4' -> 'a.temp.mp4'.

    When expected_real_ext is given and does not match the actual extension,
    `ext` is appended to the whole filename instead, mirroring
    replace_extension() below.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return f'{name}.{ext}{real_ext}'
    # BUG FIX: this branch previously returned the literal '(unknown).{ext}',
    # discarding the original filename entirely
    return f'{filename}.{ext}'
2531
2532
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension: 'a.mp4' -> 'a.webm'. When expected_real_ext is
    given and does not match, `ext` is appended to the whole filename."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return f'{name}.{ext}'
2538
2539
def check_executable(exe, args=[]):
    """Return `exe` if the binary can be spawned from PATH, else False.
    args can be a list of arguments for a short output (like -version)."""
    try:
        proc = Popen([exe, *args], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate_or_kill()
    except OSError:
        return False
    return exe
2548
2549
def _get_exe_version_output(exe, args, *, to_screen=None):
    """Run `exe args...` and return its combined stdout/stderr as text,
    or False when the executable cannot be spawned."""
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN must be redirected too: on UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background
        # https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        proc = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate_or_kill()
    except OSError:
        return False
    if isinstance(out, bytes):
        # Popen without text mode yields bytes
        out = out.decode('ascii', 'ignore')
    return out
2565
2566
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Search `output` for a version string using `version_re`; return
    `unrecognized` when no version can be found.
    (compat_str is the Python 3 str; use it directly for consistency.)"""
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
2576
2577
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    out = _get_exe_version_output(exe, args)
    if not out:
        return False
    return detect_exe_version(out, version_re, unrecognized)
2584
2585
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Raised instead of the builtin so callers can tell the source apart
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache may be shared with another LazyList (see __reversed__/__copy__)
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # replay what is cached, then pull (and cache) the rest lazily
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map index x to its mirror from the end; None stays None
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Pull only as many items from the iterable as the index requires
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            # probing one element is enough and avoids full evaluation
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # shares the cache with self; only the iteration direction differs
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2673
2674
class PagedList:
    """Base class for lazily-fetched, page-wise result lists."""

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Fetch (and optionally cache) the list of results for one page."""
        results = self._cache.get(pagenum)
        if results is None:
            # pages beyond the known page count are empty by definition
            results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
2713
2714
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # Iterate pages starting at the one containing `start`
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                # page lies entirely before the requested range
                continue

            # Offsets of the requested range within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # record where fetching failed so later (cached) lookups do
                # not retry past this page
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
2754
2755
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        # last page to fetch (exclusive), clamped to the known page count
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # items to drop from the first page so yielding starts at `start`
        skip_elems = start - start_page * self._pagesize
        # total number of items still to yield (None = unbounded)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    # this page contains the end of the requested range
                    yield from page_results[:only_more]
                    break
            yield from page_results
2780
2781
def uppercase_escape(s):
    """Decode \\UXXXXXXXX (8 hex digit) escapes embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
2788
2789
def lowercase_escape(s):
    """Decode \\uXXXX (4 hex digit) escapes embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
2796
2797
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # reserved and sub-delimiter characters are left untouched
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe_chars)
2801
2802
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    return parts._replace(
        # IDNA-encode the host, percent-escape everything else
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    ).geturl()
2813
2814
def parse_qs(url):
    """Return the query string of `url` parsed into a dict of value lists."""
    # urllib.parse directly (already imported and used in this file) instead
    # of the deprecated compat_* aliases
    return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
2817
2818
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file, skipping BOMs, blank lines and
    comment lines starting with '#', ';' or ']'. Closes the file."""
    def fixup(url):
        if not isinstance(url, str):  # compat_str is the Python 3 str
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2836
2837
def urlencode_postdata(*args, **kargs):
    """urlencode the given data and return it as ascii bytes (a POST body)."""
    # urllib.parse directly instead of the deprecated compat alias
    return urllib.parse.urlencode(*args, **kargs).encode('ascii')
2840
2841
def update_url_query(url, query):
    """Merge the `query` dict into the URL's existing query string."""
    if not query:
        return url
    # urllib.parse directly instead of the deprecated compat aliases
    parsed_url = urllib.parse.urlparse(url)
    qs = urllib.parse.parse_qs(parsed_url.query)
    qs.update(query)
    return urllib.parse.urlunparse(parsed_url._replace(
        query=urllib.parse.urlencode(qs, True)))
2850
2851
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone a urllib Request, optionally overriding url/data/headers/query
    while preserving the HTTP method and timeout."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    new_url = update_url_query(url or req.get_full_url(), query)
    method = req.get_method()
    # keep the method by picking the matching Request subclass
    if method == 'HEAD':
        req_type = HEADRequest
    elif method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        new_url, data=data or req.data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2870
2871
2872 def _multipart_encode_impl(data, boundary):
2873 content_type = 'multipart/form-data; boundary=%s' % boundary
2874
2875 out = b''
2876 for k, v in data.items():
2877 out += b'--' + boundary.encode('ascii') + b'\r\n'
2878 if isinstance(k, compat_str):
2879 k = k.encode()
2880 if isinstance(v, compat_str):
2881 v = v.encode()
2882 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2883 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2884 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2885 if boundary.encode('ascii') in content:
2886 raise ValueError('Boundary overlaps with data')
2887 out += content
2888
2889 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2890
2891 return out, content_type
2892
2893
def multipart_encode(data, boundary=None):
    """Encode a dict to RFC 7578-compliant multipart/form-data.

    data: dict whose keys and values may be str or bytes-like objects.
    boundary: optional explicit boundary string; when omitted, random
    boundaries are tried until one does not collide with the payload.

    Returns (encoded_bytes, content_type).
    Reference: https://tools.ietf.org/html/rfc7578
    """
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            # collision: only retry when we chose the boundary ourselves
            if has_specified_boundary:
                raise
            boundary = None
2922
2923
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first suitable value for any of the given key(s);
    falsy values are skipped unless skip_false_values is False."""
    for key in variadic(key_or_keys):
        val = d.get(key)
        if val is not None and (val or not skip_false_values):
            return val
    return default
2929
2930
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn, returning the first result that neither
    raises a common access error nor fails the expected_type check."""
    for fn in funcs:
        try:
            result = fn(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
2940
2941
def try_get(src, getter, expected_type=None):
    """Apply one or more getter callables to src via try_call, swallowing
    common access errors; optionally require the result's type."""
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2944
2945
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of dct keeping only items for which cndn(key, value) holds
    (by default: values that are not None)."""
    return {key: value for key, value in dct.items() if cndn(key, value)}
2948
2949
def merge_dicts(*dicts):
    """Merge dicts left to right; earlier non-None values win, except that an
    empty-string value may be replaced by a later str value."""
    merged = {}
    for current in dicts:
        for k, v in current.items():
            if k not in merged:
                if v is not None:
                    merged[k] = v
            elif merged[k] == '' and isinstance(v, str):
                merged[k] = v
    return merged
2958
2959
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as compat_str, decoding bytes-like input with the
    given encoding and error handler."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2962
2963
# MPAA rating -> equivalent age limit (used by parse_age_limit)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# TV Parental Guidelines rating -> equivalent age limit (used by parse_age_limit)
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2981
2982
def parse_age_limit(s):
    """Normalize an age limit (int, 'NN+', MPAA or TV rating) to an int in
    0..21, or None when unrecognized."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    # accept 'TV14', 'TV_14' and 'TV-14' alike
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if m:
        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
    return None
2999
3000
def strip_jsonp(code):
    """Strip a JSONP wrapper (e.g. 'callback({...});') and return the bare
    JSON payload; input without a recognizable wrapper passes through the
    substitution unchanged."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
3009
3010
def js_to_json(code, vars={}):
    """Convert a JavaScript object/value literal into valid JSON text.

    Handles single-quoted strings, unquoted keys, comments, trailing commas,
    hex/octal integers, `undefined`/`void 0` and `new Date(...)` wrappers.
    @param vars  dict of variable name -> replacement JSON text substituted
                 for bare identifiers (read-only; the mutable default is
                 never modified)
    """
    # vars is a dict of var, val pairs to substitute
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, base) pairs for integer literals that may appear as object keys
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def fix_kv(m):
        # Rewrite a single matched token (string, identifier, number, comment
        # or stray punctuation) into its JSON equivalent
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, '!'-prefixes and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # Re-escape the string body for JSON: double quotes get escaped,
            # escaped single quotes unescaped, line continuations removed,
            # \xNN converted to \u00NN
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    # A trailing ':' means the integer was an object key
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        if v in vars:
            return vars[v]

        # Bare identifier: quote it so it becomes a JSON string/key
        return '"%s"' % v

    # Unwrap new Date("...") to just the quoted string
    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3059
3060
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
3069
3070
# Valid values for the "when" argument of postprocessors/hooks
# NOTE(review): presumably listed in chronological order of execution — confirm against YoutubeDL
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
3072
3073
# Default output filename templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Recognized output template types; the value, when not None, is the
# default filename suffix/extension associated with that type
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
3091
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template for a regex matching %-style format specifiers; format() it with
# (0) a pattern for the mapping key and (1) a pattern for the conversion type
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
    '''


# All conversion type characters supported by %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3110
3111
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ellipses = '...'
    if len(s) <= length:
        return s
    # Truncate so the result (text + ellipses) is exactly `length` chars
    return s[:length - len(ellipses)] + ellipses
3120
3121
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints;
    raises ValueError on non-numeric components."""
    return tuple(map(int, re.split(r'[-.]', v)))
3124
3125
def is_outdated_version(version, limit, assume_new=True):
    """Return whether `version` is older than `limit`; when the version is
    missing or unparsable, assume it is new unless assume_new=False."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
3133
3134
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported lazily — presumably to avoid a circular import with .update; confirm
    from .update import is_non_updateable

    return not is_non_updateable()
3141
3142
def args_to_str(args):
    """Build a short, shell-quoted string representation of a subprocess
    command (for display/logging)."""
    return ' '.join(map(compat_shlex_quote, args))
3146
3147
def error_to_compat_str(err):
    """Return the message of an exception as a plain string."""
    return '%s' % err
3150
3151
def error_to_str(err):
    """Format an exception as 'TypeName: message'."""
    return '{}: {}'.format(type(err).__name__, err)
3154
3155
def mimetype2ext(mt):
    """Convert a MIME type to the preferred file extension.

    @param mt  MIME type string, possibly with parameters
               (e.g. 'audio/mpeg; codecs="mp3"'); may be None
    @returns   The mapped extension, or a best-effort guess derived from the
               subtype, or None when mt is None
    """
    if mt is None:
        return None

    # Strip any parameters ('; charset=...', '; codecs=...'); only the
    # type/subtype part is used for matching. (The original code bound the
    # parameters to an unused local variable.)
    mt = mt.partition(';')[0].strip()

    # 1. Exact full-type matches take precedence
    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    # 2. Match on the subtype alone, case-insensitively
    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    _, _, subtype = mt.rpartition('/')
    ext = SUBTYPE_MAP.get(subtype.lower())
    if ext is not None:
        return ext

    # 3. Match on the structured-syntax '+suffix' (e.g. 'vnd.foo+json' -> json)
    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    _, _, suffix = subtype.partition('+')
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    # 4. Fall back to the subtype itself, with '+' converted to '.'
    return subtype.replace('+', '.')
3218
3219
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension or a URL/filename;
    returns None for empty input or unknown extensions."""
    if not ext_or_url:
        return None
    # A bare extension is turned into a dummy filename for guess_type
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    return mimetypes.guess_type(name)[0]
3226
3227
def parse_codecs(codecs_str):
    """Parse an RFC 6381 'codecs' string into a dict with 'vcodec', 'acodec',
    'dynamic_range' and (when present) 'scodec' keys; returns {} when nothing
    could be identified."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # '0' chars are stripped so e.g. 'avc01' matches 'avc1'
        codec = parts[0].replace('0', '')
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                     'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            # Only the first video codec found is kept
            if not vcodec:
                # For vp9/av1/hvc1, keep at most the first 4 dotted parts
                vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
                if codec in ('dvh1', 'dvhe'):
                    hdr = 'DV'
                elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                    hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        elif codec in ('stpp', 'wvtt',):
            if not scodec:
                scodec = full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    # Nothing recognized: with exactly two entries, assume video + audio order
    elif len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3269
3270
def urlhandle_detect_ext(url_handle):
    """Guess the file extension of a response, preferring the filename from
    the Content-Disposition header and falling back to the Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
3283
3284
def encode_data_uri(data, mime_type):
    """Build a base64 data: URI from bytes and a MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3287
3288
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No viewer age limit configured, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
3297
3298
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # BOM signatures, longest/most specific first so UTF-32 wins over UTF-16
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]

    encoding = 'utf-8'
    remainder = first_bytes
    for bom, bom_encoding in BOMS:
        # Repeated BOMs of the same kind are all stripped
        while remainder.startswith(bom):
            encoding = bom_encoding
            remainder = remainder[len(bom):]

    return re.match(r'^\s*<', remainder.decode(encoding, 'replace'))
3316
3317
def determine_protocol(info_dict):
    """Return the download protocol for info_dict: the explicit 'protocol'
    field if set, otherwise a guess from the URL prefix/extension, falling
    back to the URL scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    # Fall back to the raw URL scheme (http, https, ftp, ...)
    return compat_urllib_parse_urlparse(url).scheme
3338
3339
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: terminal escape sequences and tabs don't count
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        # Widest cell per column
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only the columns whose filter entry is truthy (missing -> keep)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # With hide_empty, columns whose every data cell is empty are dropped
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a delimiter row under the header
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # '\t' marks right alignment: pad before the text instead of after
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
3370
3371
3372 def _match_one(filter_part, dct, incomplete):
3373 # TODO: Generalize code with YoutubeDL._build_format_filter
3374 STRING_OPERATORS = {
3375 '*=': operator.contains,
3376 '^=': lambda attr, value: attr.startswith(value),
3377 '$=': lambda attr, value: attr.endswith(value),
3378 '~=': lambda attr, value: re.search(value, attr),
3379 }
3380 COMPARISON_OPERATORS = {
3381 **STRING_OPERATORS,
3382 '<=': operator.le, # "<=" must be defined above "<"
3383 '<': operator.lt,
3384 '>=': operator.ge,
3385 '>': operator.gt,
3386 '=': operator.eq,
3387 }
3388
3389 if isinstance(incomplete, bool):
3390 is_incomplete = lambda _: incomplete
3391 else:
3392 is_incomplete = lambda k: k in incomplete
3393
3394 operator_rex = re.compile(r'''(?x)\s*
3395 (?P<key>[a-z_]+)
3396 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3397 (?:
3398 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3399 (?P<strval>.+?)
3400 )
3401 \s*$
3402 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3403 m = operator_rex.search(filter_part)
3404 if m:
3405 m = m.groupdict()
3406 unnegated_op = COMPARISON_OPERATORS[m['op']]
3407 if m['negation']:
3408 op = lambda attr, value: not unnegated_op(attr, value)
3409 else:
3410 op = unnegated_op
3411 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3412 if m['quote']:
3413 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3414 actual_value = dct.get(m['key'])
3415 numeric_comparison = None
3416 if isinstance(actual_value, (int, float)):
3417 # If the original field is a string and matching comparisonvalue is
3418 # a number we should respect the origin of the original field
3419 # and process comparison value as a string (see
3420 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3421 try:
3422 numeric_comparison = int(comparison_value)
3423 except ValueError:
3424 numeric_comparison = parse_filesize(comparison_value)
3425 if numeric_comparison is None:
3426 numeric_comparison = parse_filesize(f'{comparison_value}B')
3427 if numeric_comparison is None:
3428 numeric_comparison = parse_duration(comparison_value)
3429 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3430 raise ValueError('Operator %s only supports string values!' % m['op'])
3431 if actual_value is None:
3432 return is_incomplete(m['key']) or m['none_inclusive']
3433 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3434
3435 UNARY_OPERATORS = {
3436 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3437 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3438 }
3439 operator_rex = re.compile(r'''(?x)\s*
3440 (?P<op>%s)\s*(?P<key>[a-z_]+)
3441 \s*$
3442 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3443 m = operator_rex.search(filter_part)
3444 if m:
3445 op = UNARY_OPERATORS[m.group('op')]
3446 actual_value = dct.get(m.group('key'))
3447 if is_incomplete(m.group('key')) and actual_value is None:
3448 return True
3449 return op(actual_value)
3450
3451 raise ValueError('Invalid filter part %r' % filter_part)
3452
3453
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
    Can be True/False to indicate all/none of the keys may be missing.
    All conditions on incomplete keys pass if the key is missing
    """
    # Conditions are separated by unescaped '&'; every one must hold
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
3464
3465
def match_filter_func(filters):
    """Build a match-filter callable from one or more filter strings, or
    return None when no filters are given.

    The returned function takes (info_dict, incomplete) and returns None when
    the video passes, NO_DEFAULT to request interactive confirmation (enabled
    by a lone '-' filter), or a string message explaining the skip.
    """
    if not filters:
        return None
    filters = set(variadic(filters))

    # A lone '-' enables interactive mode for videos that pass the filters
    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        # The video passes if any one filter string matches
        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            return NO_DEFAULT if interactive and not incomplete else None
        else:
            video_title = info_dict.get('title') or info_dict.get('id') or 'video'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
3483
3484
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float).

    Supports plain offsets ('12.3', '12.3s') and clock times
    ('H:MM:SS', 'H:MM:SS.mmm', 'H:MM:SS:frac'); returns None for empty or
    unrecognized input.
    """
    if not time_expr:
        return

    mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # A second ':' in the seconds field is treated as a decimal point
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3496
3497
def srt_subtitles_timecode(seconds):
    """Format seconds as an SRT timecode 'HH:MM:SS,mmm'."""
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3500
3501
def ass_subtitles_timecode(seconds):
    """Format seconds as an ASS timecode 'H:MM:SS.cc' (centiseconds)."""
    time = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3505
3506
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError when the document contains no paragraph elements
    '''
    # Old TTAF namespaces are rewritten (as raw bytes) to their modern TTML
    # equivalents before parsing
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # TTML styling attributes that are translated to SRT-style markup
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # Shared between the style-resolution pass below and the element parser
    styles = {}
    default_style = {}

    class TTMLPElementParser:
        # NOTE(review): these are class-level (shared) attributes; the lists
        # rely on balanced push/pop within a single parse — consider
        # per-instance initialization
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                # Inline tts:* attributes override referenced styles
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already applied by an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the node and feed it through the styling-aware parser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; repeat while a parent style has not been seen yet
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style on <body> or <div> becomes the document-wide default
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3669
3670
def cli_option(params, command_option, param, separator=None):
    """Build command-line argument(s) for an option whose value comes from
    params[param]; returns [] when the value is unset (None)."""
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    # With a separator, option and value are joined into a single argument
    return [f'{command_option}{separator}{value}']
3676
3677
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build command-line argument(s) for a boolean option: emits true_value
    or false_value as the option's argument, or nothing when the param is
    unset (None)."""
    param = params.get(param)
    assert param in (True, False, None)
    # Reuses cli_option by looking the boolean up in a {True/False: text} dict
    return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3682
3683
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare command-line flag when params[param] equals expected_value,
    otherwise nothing."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3686
3687
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Select extra CLI args from argdict for the first matching key (list).

    @param argdict   dict of key -> list of args; a plain list/tuple is
                     accepted for backward compatibility
    @param keys      list of keys (or key-tuples) tried in order; all args of
                     the first key-group with any match are concatenated
    @param default   returned when argdict is None or nothing matches
    @param use_compat  when argdict is a legacy list, return it as-is
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        else:
            argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        # Keys are matched case-insensitively (argdict keys are lowercase)
        arg_list = list(filter(
            lambda x: x is not None,
            [argdict.get(key.lower()) for key in variadic(key_list)]))
        if arg_list:
            return [arg for args in arg_list for arg in args]
    return default
3706
3707
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve configuration args for an executable: builds the lookup keys
    ('exe' or 'main_key+exe', optionally suffixed) and delegates to
    cli_configuration_args."""
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{k}' for k in (keys or [''])]
    # The bare root key is only present when keys contained '' — then also
    # try the (main_key, exe) pair and the global 'default' entry
    if root_key in keys:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    else:
        use_compat = False
    return cli_configuration_args(argdict, keys, default, use_compat)
3719
3720
class ISO639Utils:
    """Conversion between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are considered, so e.g. 'en-US' works
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
3924
3925
class ISO3166Utils:
    """Lookup of full country names from ISO 3166-1 alpha-2 codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive: the code is uppercased before lookup
        return cls._country_map.get(code.upper())
4187
4188
class GeoUtils:
    """Helpers for generating plausible per-country IPv4 addresses (geo-bypass)."""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (str) inside the given block.

        `code_or_block` is either a two-letter country code (looked up in
        `_country_ip_map`; returns None if unknown) or an explicit
        "a.b.c.d/prefixlen" CIDR string.
        """
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        # Randomize only the host bits: addr_min is the network address,
        # addr_max has all host bits set
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4447
4448
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request proxy set via the
    `Ytdl-request-proxy` header (removed before the request is sent)."""

    def __init__(self, proxies=None):
        # Set default handlers. `scheme` (not `type`) as loop variable to
        # avoid shadowing the builtin `type`; the lambda still forwards it
        # as the `type` argument expected by proxy_open.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Open `req` through `proxy`, or directly for '__noproxy__'/SOCKS proxies."""
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do the SOCKS wrapping of the socket themselves
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
4472
4473
4474 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4475 # released into Public Domain
4476 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4477
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # Minimal big-endian representation (no leading zero bytes)
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        # Mirrors the original behaviour: non-positive n yields a single zero byte
        s = b'\000'
    # Front-pad so the length is a multiple of blocksize
    if blocksize > 0 and len(s) % blocksize:
        s = b'\000' * (blocksize - len(s) % blocksize) + s
    return s
4506
4507
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Big-endian interpretation; empty input yields 0, matching the original loop
    return int.from_bytes(s, 'big')
4523
4524
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is the little-endian integer value of `data`
    reversed_data = data[::-1]
    payload = int(binascii.hexlify(reversed_data), 16)
    encrypted = pow(payload, exponent, modulus)
    return format(encrypted, 'x')
4540
4541
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # PKCS#1 v1.5 (RFC 8017): EB = 00 || 02 || PS || 00 || D, where PS must
    # consist of NONZERO pseudo-random octets -- a zero octet would make the
    # decoder terminate the padding string early. Was randint(0, 254), which
    # could emit 0; use the full nonzero range 1..255 instead.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4555
4556
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer `num` in base `n` using `table` as digits."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    table = table or FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4573
4574
def decode_packed_codes(code):
    """Decode "packed" (p,a,c,k,e,d-style) obfuscated JavaScript source."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each base-N token to its replacement; an empty symbol maps to itself
    symbol_table = {}
    for index in range(count):
        base_n_index = encode_base_n(index, base)
        symbol_table[base_n_index] = symbols[index] or base_n_index

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
4591
4592
def caesar(s, alphabet, shift):
    """Shift every character of `s` that occurs in `alphabet` by `shift` positions
    (wrapping around); characters outside `alphabet` are left untouched."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(ch):
        if ch not in alphabet:
            return ch
        return alphabet[(alphabet.index(ch) + shift) % size]

    return ''.join(map(rotate, s))


def rot47(s):
    """Apply the self-inverse ROT47 cipher over the printable ASCII range."""
    printable_ascii = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable_ascii, 47)
4604
4605
def parse_m3u8_attributes(attrib):
    """Parse an m3u8 attribute list ('KEY=val,KEY2="quoted"') into a dict."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Strip surrounding quotes from quoted values
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
4613
4614
def urshift(val, n):
    """Unsigned (logical) right shift of a 32-bit value, like JavaScript's `>>>`."""
    if val < 0:
        val += 0x100000000
    return val >> n
4617
4618
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG into (width, height, pixels), pixels being a list of rows
    of byte values.

    NOTE(review): assumes a non-interlaced image with 3 bytes per pixel
    (stride = width * 3, no alpha) -- confirm inputs match before reuse.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # Validate the PNG signature and that the first chunk is IHDR
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    # Big-endian unsigned int of 1, 2 or 4 bytes
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Each chunk: 4-byte length, 4-byte type, payload, 4-byte CRC
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is the first chunk (verified above); width/height are its first fields
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Image data may be split over several IDAT chunks; concatenate before inflating
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Bytes per scanline (3 bytes per pixel, see docstring note)
    stride = width * 3
    pixels = []

    # Fetch an already-reconstructed sample by absolute byte index
    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed with a 1-byte filter type
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # "left" is the corresponding byte of the previous pixel (3 bytes back),
            # available from the second pixel of the row onward
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Undo the per-scanline filter (https://www.w3.org/TR/PNG/#9Filters)
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                # c is the upper-left neighbour
                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Choose the predictor closest to p (ties favour a, then b)
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
4724
4725
def write_xattr(path, key, value):
    """Set the extended attribute `key` to the bytes `value` on file `path`.

    Tries, in order: NTFS Alternate Data Streams (Windows), the python
    xattr/pyxattr modules, then the `setfattr`/`xattr` command-line tools.
    Raises XAttrMetadataError on failure and XAttrUnavailableError when no
    method is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            # "path:key" addresses the named ADS of the file
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules
    from .dependencies import xattr

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The CLI tools take the value as a text argument
    value = value.decode()
    try:
        p = Popen(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    stderr = p.communicate_or_kill()[1].decode('utf-8', 'replace')
    if p.returncode:
        raise XAttrMetadataError(p.returncode, stderr)
4777
4778
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the (stringified) parts
    of a random date between 1950-01-01 and 1995-12-31."""
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    chosen = earliest + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
4789
4790
# Templates for internet shortcut files, which are plain text files.

# `.url` shortcut ([InternetShortcut] INI section)
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# `.webloc` shortcut (Apple XML property list)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# `.desktop` shortcut (freedesktop.org Desktop Entry of Type=Link)
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Lookup table of the templates above by short format name
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4822
4823
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    # NOTE(review): the port is dropped only when it is literally 80, even for
    # schemes whose default port differs (e.g. https) -- confirm this is intended.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
            net_location,

            urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4866
4867
def to_high_limit_path(path):
    """On Windows, prefix the absolute path with \\\\?\\ to bypass the MAX_PATH
    limit; on other platforms return `path` unchanged."""
    if sys.platform not in ['win32', 'cygwin']:
        return path
    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
4874
4875
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format the value at `field` of `obj` with `template`; return `default`
    when the value is one of `ignore`. `func`, if given, transforms the value first."""
    val = traverse_obj(obj, *variadic(field))
    if val in ignore:
        return default
    if func:
        val = func(val)
    return template % val
4881
4882
def clean_podcast_url(url):
    """Strip known podcast tracking/measurement redirect prefixes from `url`."""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix_re, '', url)
4898
4899
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random RFC 4122 version-4 UUID string.

    Bugfix: the variant nibble (the 'y' position) must be one of 8, 9, a, b
    per RFC 4122 section 4.4; previously it was drawn from the full hex table,
    producing invalid UUIDs ~75% of the time.
    """
    return re.sub(
        r'[xy]',
        lambda m: random.choice('89ab') if m.group(0) == 'y' else _HEX_TABLE[random.randint(0, 15)],
        'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4905
4906
def make_dir(path, to_screen=None):
    """Create the parent directory of `path` (if any) when it does not exist.

    Returns True on success or when nothing needed to be done. On OSError,
    reports through `to_screen` (if callable) and returns False.
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except OSError as err:
        # Bugfix: was `if callable(to_screen) is not None:`, which is always
        # truthy (a bool is never None) and so crashed with a TypeError when
        # to_screen was left as None and directory creation failed
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
4917
4918
def get_executable_path():
    """Return the directory containing the running yt-dlp executable/script."""
    from .update import _get_variant_and_executable_path

    exe_path = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(exe_path))
4923
4924
def load_plugins(name, suffix, namespace):
    """Load `ytdlp_plugins/<name>/__init__.py` and register every attribute
    ending in `suffix` into `namespace` (skipping names already present).

    Returns a dict of the newly registered classes. Missing plugin files are
    silently ignored.
    """
    classes = {}
    with contextlib.suppress(FileNotFoundError):
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        # Bugfix (hygiene): the loop previously rebound the `name` parameter,
        # shadowing the plugin-package name for the rest of the function
        for member_name in dir(plugins):
            if member_name in namespace:
                continue
            if not member_name.endswith(suffix):
                continue
            klass = getattr(plugins, member_name)
            classes[member_name] = namespace[member_name] = klass
    return classes
4941
4942
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                            - None:     Do nothing
                            - string:   A dictionary key
                            - int:      An index into a list
                            - tuple:    A list of keys all of which will be traversed
                            - Ellipsis: Fetch all values in the object
                            - Function: Takes the key and value as arguments
                                        and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    # TODO: Write tests
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    # `depth` (nonlocal, reset per path below) counts how many branching
    # levels (Ellipsis / tuple / filter-function keys) were traversed, so the
    # caller loop knows how many nesting levels to flatten afterwards
    def _traverse_obj(obj, path, _current_depth=0):
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # A tuple key branches into every sub-key, then behaves like Ellipsis
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                # Filter-function key: keep items for which key(k, v) is truthy
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # Convert user-supplied string keys to int or slice
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Branching occurred: flatten the nested result lists and
                # filter out non-matching/missing values
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
5041
5042
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated thin wrapper around traverse_obj()."""
    write_string(
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5047
5048
def get_first(obj, keys, **kwargs):
    """Return the first value found for `keys` across all items of `obj`."""
    path = (..., *variadic(keys))
    return traverse_obj(obj, path, get_all=False, **kwargs)
5051
5052
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return `x` unchanged if it is a non-atomic iterable, else wrap it in a tuple.

    `allowed_types` are treated as atomic even though they are iterable.
    """
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types):
        return x
    return (x,)
5055
5056
def decode_base(value, digits):
    """Convert the base-N string `value` to an int, using `digits` as the
    digit alphabet (base = len(digits)). Empty input yields 0."""
    # This will convert given base-x string to scalar (long or int)
    table = {char: index for index, char in enumerate(digits)}
    result = 0
    base = len(digits)
    # Loop variable renamed: it previously shadowed the builtin `chr`
    for char in value:
        result *= base
        result += table[char]
    return result
5066
5067
def time_seconds(**kwargs):
    """Current UNIX timestamp; `kwargs` are timedelta arguments defining the
    timezone offset used for "now" (the timestamp itself is offset-independent)."""
    tz = datetime.timezone(datetime.timedelta(**kwargs))
    return datetime.datetime.now(tz).timestamp()
5071
5072
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create an HS256-signed JWT (bytes, JWS Compact Serialization).

    @param payload_data  JSON-serializable claims
    @param key           shared secret (str) for the HMAC-SHA256 signature
    @param headers       optional extra/overriding JOSE header fields
    """
    # Bugfix (hygiene): default was the mutable `headers={}`; None avoids the
    # shared-mutable-default pitfall with identical behavior for callers
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
    return token
5090
5091
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode and return the payload of a JWT WITHOUT verifying its signature."""
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    # Bugfix: standard JWTs use base64url WITHOUT padding (RFC 7515), which
    # made urlsafe_b64decode raise "Incorrect padding". Appending '===' always
    # provides enough padding; the decoder discards any excess.
    payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
    return payload_data
5097
5098
# None means "not applicable"; on Windows it starts False until VT mode is enabled
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Whether ANSI/VT escape sequences may be written to `stream`."""
    vt_unavailable = (
        not WINDOWS_VT_MODE if compat_os_name == 'nt'
        else not os.getenv('TERM'))
    if vt_unavailable:
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
5113
5114
def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    """Try to enable virtual-terminal (ANSI escape) processing on Windows consoles."""
    # Requires Windows 10 build 10586 or newer
    if get_windows_version() < (10, 0, 10586):
        return
    global WINDOWS_VT_MODE
    startupinfo = subprocess.STARTUPINFO()
    startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    try:
        # NOTE(review): spawning an (empty) shell command appears to be the
        # workaround that switches the console into VT mode -- see the TODO
        # above for the proper Win32 API approach
        subprocess.Popen('', shell=True, startupinfo=startupinfo).wait()
    except Exception:
        return

    WINDOWS_VT_MODE = True
    # Invalidate cached results now that VT mode has changed
    supports_terminal_sequences.cache_clear()
5128
5129
# Matches ESC [ ... m (SGR color/style sequences)
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with ANSI SGR escape sequences stripped."""
    cleaned = _terminal_sequences_re.sub('', string)
    return cleaned
5135
5136
def number_of_digits(number):
    """Length of the decimal representation of `number` (sign included)."""
    decimal = '%d' % number
    return len(decimal)
5139
5140
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy values with `delim`.

    When `from_dict` is given, `values` are first looked up as its keys.
    """
    if from_dict is not None:
        values = map(from_dict.get, values)
    return delim.join(str(value) for value in values if value)
5145
5146
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails
    new_width = str(max_dimensions[0])
    scaled = []
    for thumbnail in thumbnails:
        scaled.append(merge_dicts(
            {'url': re.sub(url_width_re, new_width, thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
5167
5168
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if range:
        match = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
        if match:
            # (start, end, total) - end and total are optional in the header
            return int(match.group(1)), int_or_none(match.group(2)), int_or_none(match.group(3))
    return None, None, None
5177
5178
def read_stdin(what):
    """Announce that `what` will be read from STDIN, then return the stdin stream."""
    if compat_os_name == 'nt':
        eof = 'Ctrl+Z'
    else:
        eof = 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin
5183
5184
class Config:
    """A (possibly nested) set of command-line arguments.

    Each nested `--config-locations` file becomes a child Config; already
    loaded paths are tracked to break inclusion cycles.
    """
    # raw args this instance was initialized with (None until init())
    own_args = None
    # same args, kept after they have been parsed by the option parser
    parsed_args = None
    # file the args were read from, if any
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Parse `args` (read from `filename`, if given) and recursively load
        any configs referenced via `--config-locations`.

        Returns False if `filename` was already loaded (cycle protection),
        True otherwise. May only be called once per instance.
        """
        assert not self.__initialized
        directory = ''
        if filename:
            location = os.path.realpath(filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.own_args, self.__initialized = args, True
        opts, _ = self.parser.parse_known_args(args)
        self.parsed_args, self.filename = args, filename

        for location in opts.config_locations or []:
            if location == '-':
                # "-" means: read extra options interactively from stdin
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # locations in a config file are resolved relative to that file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        # render own args (with credentials hidden) plus indented child configs
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read `filename` and shlex-split its contents into an args list.

        Returns `default` if the file cannot be opened.
        """
        try:
            optionf = open(filename)
        except OSError:
            return default  # silently skip if file is not present
        # context manager guarantees the handle is closed even if parsing fails
        with optionf:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            return shlex.split(contents, comments=True)

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with values of credential options replaced
        by the string 'PRIVATE' (both "--opt=value" and "--opt value" forms)."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child Config (sharing the loaded-paths set) and keep it
        if its init() succeeds."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """All args, child configs first (later configs take precedence)."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
5279
5280
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""
    # the live connection object, set by __enter__
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # ensure the connection is torn down even if the caller never exits
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        """Send a message over the websocket (blocking)."""
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        """Receive a message from the websocket (blocking)."""
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Cancel leftover tasks *before* closing the loop:
            # _cancel_all_tasks calls loop.run_until_complete, which raises
            # RuntimeError once the loop is closed. This mirrors the shutdown
            # order used by asyncio.run (cancel tasks, then close the loop).
            self._cancel_all_tasks(self.loop)
            self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run coroutine `main` to completion on `loop` and return its result,
        shutting down async generators (and the default executor, where
        available) afterwards. Raises ValueError for non-coroutine input."""
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel every pending task on `loop` and surface any exception that
        was raised during shutdown via the loop's exception handler."""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5350
5351
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for key, value in headers.items():
            # title-casing normalizes e.g. 'content-type' -> 'Content-Type'
            merged[key.title()] = value
    return merged
5355
5356
class classproperty:
    """classmethod(property(func)) that works in py < 3.9"""

    def __init__(self, func):
        self.func = func
        functools.update_wrapper(self, func)

    def __get__(self, instance, owner):
        # always evaluate against the class, regardless of how it was accessed
        return self.func(owner)
5366
5367
5368 class Namespace:
5369 """Immutable namespace"""
5370
5371 def __init__(self, **kwargs):
5372 self._dict = kwargs
5373
5374 def __getattr__(self, attr):
5375 return self._dict[attr]
5376
5377 def __contains__(self, item):
5378 return item in self._dict.values()
5379
5380 def __iter__(self):
5381 return iter(self._dict.items())
5382
5383 def __repr__(self):
5384 return f'{type(self).__name__}({", ".join(f"{k}={v}" for k, v in self)})'
5385
5386
# Deprecated
# Availability flags mirroring the optional-dependency imports above;
# kept for backward compatibility with code that still reads them
has_certifi = bool(certifi)
has_websockets = bool(websockets)