import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import asyncio, functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
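
# Illustrative usage of xpath_with_ns (not part of the original module): the
# namespace prefixes in the path are expanded via the mapping into the
# Clark-notation path that xml.etree.ElementTree expects.
#   >>> xpath_with_ns('media:song/media:author', {'media': 'http://example.com/'})
#   '{http://example.com/}song/{http://example.com/}author'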


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
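
# Illustrative usage of the attribute lookup helpers (not part of the original
# module): get_element_by_attribute() returns the decoded content, while the
# *_html_* variant returns the whole element.
#   >>> get_element_by_attribute('class', 'foo', '<div class="foo">a <b>b</b></div>')
#   'a <b>b</b>'
#   >>> get_element_html_by_attribute('class', 'foo', '<div class="foo">a</div>')
#   '<div class="foo">a</div>'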


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
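
# Illustrative usage (not part of the original module): returns a (text, html)
# pair for the first matching tag, correctly handling nested tags of the same
# name.
#   >>> get_element_text_and_html_by_tag('div', '<div><div>inner</div>outer</div>')
#   ('<div>inner</div>outer', '<div><div>inner</div>outer</div>')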


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
543 """Given a string for an HTML element such as
544 <el
545 a="foo" B="bar" c="&98;az" d=boz
546 empty= noval entity="&amp;"
547 sq='"' dq="'"
548 >
549 Decode and return a dictionary of attributes.
550 {
551 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
552 'empty': '', 'noval': None, 'entity': '&',
553 'sq': '"', 'dq': '\''
554 }.
555 """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of their attribute dictionaries"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
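
# Illustrative behaviour of clean_html (not part of the original module):
# <br> and </p><p> become newlines, remaining tags are stripped and entities
# decoded.
#   >>> clean_html('a<br/>b &amp; <span>c</span>')
#   'a\nb & c'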


class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
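
# Illustrative usage of timeconvert (not part of the original module):
#   >>> timeconvert('Thu, 01 Jan 1970 00:00:00 +0000')
#   0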


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
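
# Illustrative behaviour of sanitize_filename (not part of the original
# module): forbidden characters are replaced, and restricted mode additionally
# transliterates accents and replaces anything non-ASCII.
#   >>> sanitize_filename('New World record at 0:12:34')
#   'New World record at 0_12_34'
#   >>> sanitize_filename('aäb中国的c', restricted=True)
#   'aab_c'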


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
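
# Illustrative behaviour of sanitize_url (not part of the original module):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com')
#   'https://example.com'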


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
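
# Illustrative usage of extract_basic_auth (not part of the original module):
# credentials embedded in the URL are stripped and returned as a header value.
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')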


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
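
# Illustrative usage of orderedSet (not part of the original module): order is
# preserved, and items only need to support equality, not hashing.
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]
#   >>> orderedSet([{'a': 1}, {'a': 1}])  # unhashable items are fine
#   [{'a': 1}]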


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
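
# Illustrative behaviour of unescapeHTML (not part of the original module):
# named, decimal and hexadecimal entities are all handled.
#   >>> unescapeHTML('&amp;&#38;&#x26;')
#   '&&&'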


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode
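
# Illustrative usage of Popen.run (not part of the original module; assumes
# ffprobe is on PATH): runs a command to completion and returns
# (stdout, stderr, returncode), killing the process if interrupted.
#   stdout, stderr, returncode = Popen.run(
#       ['ffprobe', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)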


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
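
# Illustrative behaviour of the time helpers (not part of the original module):
#   >>> timetuple_from_msec(345244)
#   Time(hours=0, minutes=5, seconds=45, milliseconds=244)
#   >>> formatSeconds(3661)
#   '1:01:01'
#   >>> formatSeconds(62, msec=True)
#   '1:02.000'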


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        try:
            context.load_default_certs()
        # Work around the issue in load_default_certs when there are bad certificates. See:
        # https://github.com/yt-dlp/yt-dlp/issues/1060,
        # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters; however, this is not
        # always respected by websites: some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412]).
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991),
        # so to work around the aforementioned issue we replace the request's original URL with
        # a percent-encoded one.
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
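
# Illustrative behaviour of extract_timezone (not part of the original module):
# returns the UTC offset and the date string with the timezone part removed.
#   >>> extract_timezone('2014-05-14T10:00:00+02:00')
#   (datetime.timedelta(seconds=7200), '2014-05-14T10:00:00')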


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
1710
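# Illustrative examples (values worked out by hand from the code above, not
# recorded output):
#   >>> extract_timezone('2014-12-04 12:05:01+0200')
#   (datetime.timedelta(seconds=7200), '2014-12-04 12:05:01')
#   >>> parse_iso8601('2014-12-04T12:05:01+02:00')
#   1417687501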
1711
1712 def date_formats(day_first=True):
1713 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1714
1715
1716 def unified_strdate(date_str, day_first=True):
1717 """Return a string with the date in the format YYYYMMDD"""
1718
1719 if date_str is None:
1720 return None
1721 upload_date = None
1722 # Replace commas
1723 date_str = date_str.replace(',', ' ')
1724 # Remove AM/PM + timezone
1725 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1726 _, date_str = extract_timezone(date_str)
1727
1728 for expression in date_formats(day_first):
1729 with contextlib.suppress(ValueError):
1730 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1731 if upload_date is None:
1732 timetuple = email.utils.parsedate_tz(date_str)
1733 if timetuple:
1734 with contextlib.suppress(ValueError):
1735 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1736 if upload_date is not None:
1737 return str(upload_date)
1738
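# Illustrative example (hand-checked; assumes '%b %d %Y' is among the module's
# DATE_FORMATS): commas and AM/PM markers are stripped before parsing:
#   >>> unified_strdate('Dec 14, 2012')
#   '20121214'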
1739
1740 def unified_timestamp(date_str, day_first=True):
1741 if date_str is None:
1742 return None
1743
1744 date_str = re.sub(r'[,|]', '', date_str)
1745
1746 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1747 timezone, date_str = extract_timezone(date_str)
1748
1749 # Remove AM/PM + timezone
1750 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1751
1752 # Remove unrecognized timezones from ISO 8601 alike timestamps
1753 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1754 if m:
1755 date_str = date_str[:-len(m.group('tz'))]
1756
1757 # Python only supports microseconds, so remove nanoseconds
1758 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1759 if m:
1760 date_str = m.group(1)
1761
1762 for expression in date_formats(day_first):
1763 with contextlib.suppress(ValueError):
1764 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1765 return calendar.timegm(dt.timetuple())
1766 timetuple = email.utils.parsedate_tz(date_str)
1767 if timetuple:
1768 return calendar.timegm(timetuple) + pm_delta * 3600
1769
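# Illustrative example (hand-checked): the same style of input, parsed to a
# UNIX timestamp:
#   >>> unified_timestamp('Dec 14, 2012')
#   1355443200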
1770
1771 def determine_ext(url, default_ext='unknown_video'):
1772 if url is None or '.' not in url:
1773 return default_ext
1774 guess = url.partition('?')[0].rpartition('.')[2]
1775 if re.match(r'^[A-Za-z0-9]+$', guess):
1776 return guess
1777 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1778 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1779 return guess.rstrip('/')
1780 else:
1781 return default_ext
1782
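# Illustrative examples (hypothetical URLs): query strings and trailing
# slashes are tolerated when guessing the extension:
#   >>> determine_ext('http://example.com/video.mp4?dl=1')
#   'mp4'
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'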
1783
1784 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1785 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1786
1787
1788 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1789 R"""
1790 Return a datetime object from a string.
1791 Supported format:
1792 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1793
1794 @param format strftime format of DATE
1795 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1796 auto: round to the unit provided in date_str (if applicable).
1797 """
1798 auto_precision = False
1799 if precision == 'auto':
1800 auto_precision = True
1801 precision = 'microsecond'
1802 today = datetime_round(datetime.datetime.utcnow(), precision)
1803 if date_str in ('now', 'today'):
1804 return today
1805 if date_str == 'yesterday':
1806 return today - datetime.timedelta(days=1)
1807 match = re.match(
1808 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1809 date_str)
1810 if match is not None:
1811 start_time = datetime_from_str(match.group('start'), precision, format)
1812 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1813 unit = match.group('unit')
1814 if unit == 'month' or unit == 'year':
1815 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1816 unit = 'day'
1817 else:
1818 if unit == 'week':
1819 unit = 'day'
1820 time *= 7
1821 delta = datetime.timedelta(**{unit + 's': time})
1822 new_date = start_time + delta
1823 if auto_precision:
1824 return datetime_round(new_date, unit)
1825 return new_date
1826
1827 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1828
1829
1830 def date_from_str(date_str, format='%Y%m%d', strict=False):
1831 R"""
1832 Return a date object from a string using datetime_from_str
1833
1834 @param strict Restrict allowed patterns to "YYYYMMDD" and
1835 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1836 """
1837 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1838 raise ValueError(f'Invalid date format "{date_str}"')
1839 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1840
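# Illustrative examples (the relative form is resolved against the current
# UTC date, so its result varies):
#   >>> date_from_str('20221001')
#   datetime.date(2022, 10, 1)
#   >>> date_from_str('today-1week')  # the date 7 days ago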
1841
1842 def datetime_add_months(dt, months):
1843 """Increment/Decrement a datetime object by months."""
1844 month = dt.month + months - 1
1845 year = dt.year + month // 12
1846 month = month % 12 + 1
1847 day = min(dt.day, calendar.monthrange(year, month)[1])
1848 return dt.replace(year, month, day)
1849
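# Illustrative example (hand-checked): the day is clamped to the length of
# the target month:
#   >>> datetime_add_months(datetime.datetime(2020, 1, 31), 1)
#   datetime.datetime(2020, 2, 29, 0, 0)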
1850
1851 def datetime_round(dt, precision='day'):
1852 """
1853 Round a datetime object's time to a specific precision
1854 """
1855 if precision == 'microsecond':
1856 return dt
1857
1858 unit_seconds = {
1859 'day': 86400,
1860 'hour': 3600,
1861 'minute': 60,
1862 'second': 1,
1863 }
1864 roundto = lambda x, n: ((x + n / 2) // n) * n
1865 timestamp = calendar.timegm(dt.timetuple())
1866 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1867
1868
1869 def hyphenate_date(date_str):
1870 """
1871 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1872 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1873 if match is not None:
1874 return '-'.join(match.groups())
1875 else:
1876 return date_str
1877
1878
1879 class DateRange:
1880 """Represents a time interval between two dates"""
1881
1882 def __init__(self, start=None, end=None):
1883 """start and end must be strings in the format accepted by date"""
1884 if start is not None:
1885 self.start = date_from_str(start, strict=True)
1886 else:
1887 self.start = datetime.datetime.min.date()
1888 if end is not None:
1889 self.end = date_from_str(end, strict=True)
1890 else:
1891 self.end = datetime.datetime.max.date()
1892 if self.start > self.end:
1893 raise ValueError('Date range "%s": the start date must be before the end date' % self)
1894
1895 @classmethod
1896 def day(cls, day):
1897 """Returns a range that only contains the given day"""
1898 return cls(day, day)
1899
1900 def __contains__(self, date):
1901 """Check if the date is in the range"""
1902 if not isinstance(date, datetime.date):
1903 date = date_from_str(date)
1904 return self.start <= date <= self.end
1905
1906 def __str__(self):
1907 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1908
1909
1910 def platform_name():
1911 """ Returns the platform name as a str """
1912 res = platform.platform()
1913 if isinstance(res, bytes):
1914 res = res.decode(preferredencoding())
1915
1916 assert isinstance(res, str)
1917 return res
1918
1919
1920 @functools.cache
1921 def get_windows_version():
1922 ''' Get the Windows version as a tuple. Returns () if not running on Windows '''
1923 if compat_os_name == 'nt':
1924 return version_tuple(platform.win32_ver()[1])
1925 else:
1926 return ()
1927
1928
1929 def write_string(s, out=None, encoding=None):
1930 assert isinstance(s, str)
1931 out = out or sys.stderr
1932
1933 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1934 s = re.sub(r'([\r\n]+)', r' \1', s)
1935
1936 enc, buffer = None, out
1937 if 'b' in getattr(out, 'mode', ''):
1938 enc = encoding or preferredencoding()
1939 elif hasattr(out, 'buffer'):
1940 buffer = out.buffer
1941 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1942
1943 buffer.write(s.encode(enc, 'ignore') if enc else s)
1944 out.flush()
1945
1946
1947 def bytes_to_intlist(bs):
1948 if not bs:
1949 return []
1950 if isinstance(bs[0], int): # bytes-like objects yield ints when indexed
1951 return list(bs)
1952 else:
1953 return [ord(c) for c in bs]
1954
1955
1956 def intlist_to_bytes(xs):
1957 if not xs:
1958 return b''
1959 return struct.pack('%dB' % len(xs), *xs)
1960
1961
1962 class LockingUnsupportedError(OSError):
1963 msg = 'File locking is not supported'
1964
1965 def __init__(self):
1966 super().__init__(self.msg)
1967
1968
1969 # Cross-platform file locking
1970 if sys.platform == 'win32':
1971 import ctypes.wintypes
1972 import msvcrt
1973
1974 class OVERLAPPED(ctypes.Structure):
1975 _fields_ = [
1976 ('Internal', ctypes.wintypes.LPVOID),
1977 ('InternalHigh', ctypes.wintypes.LPVOID),
1978 ('Offset', ctypes.wintypes.DWORD),
1979 ('OffsetHigh', ctypes.wintypes.DWORD),
1980 ('hEvent', ctypes.wintypes.HANDLE),
1981 ]
1982
1983 kernel32 = ctypes.windll.kernel32
1984 LockFileEx = kernel32.LockFileEx
1985 LockFileEx.argtypes = [
1986 ctypes.wintypes.HANDLE, # hFile
1987 ctypes.wintypes.DWORD, # dwFlags
1988 ctypes.wintypes.DWORD, # dwReserved
1989 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1990 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1991 ctypes.POINTER(OVERLAPPED) # Overlapped
1992 ]
1993 LockFileEx.restype = ctypes.wintypes.BOOL
1994 UnlockFileEx = kernel32.UnlockFileEx
1995 UnlockFileEx.argtypes = [
1996 ctypes.wintypes.HANDLE, # hFile
1997 ctypes.wintypes.DWORD, # dwReserved
1998 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1999 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2000 ctypes.POINTER(OVERLAPPED) # Overlapped
2001 ]
2002 UnlockFileEx.restype = ctypes.wintypes.BOOL
2003 whole_low = 0xffffffff
2004 whole_high = 0x7fffffff
2005
2006 def _lock_file(f, exclusive, block):
2007 overlapped = OVERLAPPED()
2008 overlapped.Offset = 0
2009 overlapped.OffsetHigh = 0
2010 overlapped.hEvent = 0
2011 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2012
2013 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2014 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2015 0, whole_low, whole_high, f._lock_file_overlapped_p):
2016 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2017 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2018
2019 def _unlock_file(f):
2020 assert f._lock_file_overlapped_p
2021 handle = msvcrt.get_osfhandle(f.fileno())
2022 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2023 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2024
2025 else:
2026 try:
2027 import fcntl
2028
2029 def _lock_file(f, exclusive, block):
2030 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2031 if not block:
2032 flags |= fcntl.LOCK_NB
2033 try:
2034 fcntl.flock(f, flags)
2035 except BlockingIOError:
2036 raise
2037 except OSError: # AOSP does not have flock()
2038 fcntl.lockf(f, flags)
2039
2040 def _unlock_file(f):
2041 try:
2042 fcntl.flock(f, fcntl.LOCK_UN)
2043 except OSError:
2044 fcntl.lockf(f, fcntl.LOCK_UN)
2045
2046 except ImportError:
2047
2048 def _lock_file(f, exclusive, block):
2049 raise LockingUnsupportedError()
2050
2051 def _unlock_file(f):
2052 raise LockingUnsupportedError()
2053
2054
2055 class locked_file:
2056 locked = False
2057
2058 def __init__(self, filename, mode, block=True, encoding=None):
2059 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2060 raise NotImplementedError(mode)
2061 self.mode, self.block = mode, block
2062
2063 writable = any(f in mode for f in 'wax+')
2064 readable = any(f in mode for f in 'r+')
2065 flags = functools.reduce(operator.ior, (
2066 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2067 getattr(os, 'O_BINARY', 0), # Windows only
2068 getattr(os, 'O_NOINHERIT', 0), # Windows only
2069 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2070 os.O_APPEND if 'a' in mode else 0,
2071 os.O_EXCL if 'x' in mode else 0,
2072 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2073 ))
2074
2075 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2076
2077 def __enter__(self):
2078 exclusive = 'r' not in self.mode
2079 try:
2080 _lock_file(self.f, exclusive, self.block)
2081 self.locked = True
2082 except OSError:
2083 self.f.close()
2084 raise
2085 if 'w' in self.mode:
2086 try:
2087 self.f.truncate()
2088 except OSError as e:
2089 if e.errno not in (
2090 errno.ESPIPE, # Illegal seek - expected for FIFO
2091 errno.EINVAL, # Invalid argument - expected for /dev/null
2092 ):
2093 raise
2094 return self
2095
2096 def unlock(self):
2097 if not self.locked:
2098 return
2099 try:
2100 _unlock_file(self.f)
2101 finally:
2102 self.locked = False
2103
2104 def __exit__(self, *_):
2105 try:
2106 self.unlock()
2107 finally:
2108 self.f.close()
2109
2110 open = __enter__
2111 close = __exit__
2112
2113 def __getattr__(self, attr):
2114 return getattr(self.f, attr)
2115
2116 def __iter__(self):
2117 return iter(self.f)
2118
2119
2120 @functools.cache
2121 def get_filesystem_encoding():
2122 encoding = sys.getfilesystemencoding()
2123 return encoding if encoding is not None else 'utf-8'
2124
2125
2126 def shell_quote(args):
2127 quoted_args = []
2128 encoding = get_filesystem_encoding()
2129 for a in args:
2130 if isinstance(a, bytes):
2131 # We may get a filename encoded with 'encodeFilename'
2132 a = a.decode(encoding)
2133 quoted_args.append(compat_shlex_quote(a))
2134 return ' '.join(quoted_args)
2135
2136
2137 def smuggle_url(url, data):
2138 """ Pass additional data in a URL for internal use. """
2139
2140 url, idata = unsmuggle_url(url, {})
2141 data.update(idata)
2142 sdata = urllib.parse.urlencode(
2143 {'__youtubedl_smuggle': json.dumps(data)})
2144 return url + '#' + sdata
2145
2146
2147 def unsmuggle_url(smug_url, default=None):
2148 if '#__youtubedl_smuggle' not in smug_url:
2149 return smug_url, default
2150 url, _, sdata = smug_url.rpartition('#')
2151 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2152 data = json.loads(jsond)
2153 return url, data
2154
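# Illustrative round trip (hypothetical URL and payload): the data travels in
# the URL fragment:
#   >>> url = smuggle_url('http://example.com/video', {'referer': 'http://example.com/'})
#   >>> unsmuggle_url(url)
#   ('http://example.com/video', {'referer': 'http://example.com/'})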
2155
2156 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2157 """ Formats numbers with decimal sufixes like K, M, etc """
2158 num, factor = float_or_none(num), float(factor)
2159 if num is None or num < 0:
2160 return None
2161 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2162 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2163 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2164 if factor == 1024:
2165 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2166 converted = num / (factor ** exponent)
2167 return fmt % (converted, suffix)
2168
2169
2170 def format_bytes(bytes):
2171 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2172
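# Illustrative examples (hand-checked): factor 1000 yields SI suffixes, while
# factor 1024 yields binary (KiB/MiB/...) suffixes:
#   >>> format_decimal_suffix(123456)
#   '123k'
#   >>> format_bytes(1024 ** 2)
#   '1.00MiB'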
2173
2174 def lookup_unit_table(unit_table, s):
2175 units_re = '|'.join(re.escape(u) for u in unit_table)
2176 m = re.match(
2177 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2178 if not m:
2179 return None
2180 num_str = m.group('num').replace(',', '.')
2181 mult = unit_table[m.group('unit')]
2182 return int(float(num_str) * mult)
2183
2184
2185 def parse_filesize(s):
2186 if s is None:
2187 return None
2188
2189 # The lower-case forms are of course incorrect and unofficial,
2190 # but we support those too
2191 _UNIT_TABLE = {
2192 'B': 1,
2193 'b': 1,
2194 'bytes': 1,
2195 'KiB': 1024,
2196 'KB': 1000,
2197 'kB': 1024,
2198 'Kb': 1000,
2199 'kb': 1000,
2200 'kilobytes': 1000,
2201 'kibibytes': 1024,
2202 'MiB': 1024 ** 2,
2203 'MB': 1000 ** 2,
2204 'mB': 1024 ** 2,
2205 'Mb': 1000 ** 2,
2206 'mb': 1000 ** 2,
2207 'megabytes': 1000 ** 2,
2208 'mebibytes': 1024 ** 2,
2209 'GiB': 1024 ** 3,
2210 'GB': 1000 ** 3,
2211 'gB': 1024 ** 3,
2212 'Gb': 1000 ** 3,
2213 'gb': 1000 ** 3,
2214 'gigabytes': 1000 ** 3,
2215 'gibibytes': 1024 ** 3,
2216 'TiB': 1024 ** 4,
2217 'TB': 1000 ** 4,
2218 'tB': 1024 ** 4,
2219 'Tb': 1000 ** 4,
2220 'tb': 1000 ** 4,
2221 'terabytes': 1000 ** 4,
2222 'tebibytes': 1024 ** 4,
2223 'PiB': 1024 ** 5,
2224 'PB': 1000 ** 5,
2225 'pB': 1024 ** 5,
2226 'Pb': 1000 ** 5,
2227 'pb': 1000 ** 5,
2228 'petabytes': 1000 ** 5,
2229 'pebibytes': 1024 ** 5,
2230 'EiB': 1024 ** 6,
2231 'EB': 1000 ** 6,
2232 'eB': 1024 ** 6,
2233 'Eb': 1000 ** 6,
2234 'eb': 1000 ** 6,
2235 'exabytes': 1000 ** 6,
2236 'exbibytes': 1024 ** 6,
2237 'ZiB': 1024 ** 7,
2238 'ZB': 1000 ** 7,
2239 'zB': 1024 ** 7,
2240 'Zb': 1000 ** 7,
2241 'zb': 1000 ** 7,
2242 'zettabytes': 1000 ** 7,
2243 'zebibytes': 1024 ** 7,
2244 'YiB': 1024 ** 8,
2245 'YB': 1000 ** 8,
2246 'yB': 1024 ** 8,
2247 'Yb': 1000 ** 8,
2248 'yb': 1000 ** 8,
2249 'yottabytes': 1000 ** 8,
2250 'yobibytes': 1024 ** 8,
2251 }
2252
2253 return lookup_unit_table(_UNIT_TABLE, s)
2254
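# Illustrative examples (hand-checked; note that a decimal comma is accepted):
#   >>> parse_filesize('2 MiB')
#   2097152
#   >>> parse_filesize('1,5 GB')
#   1500000000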
2255
2256 def parse_count(s):
2257 if s is None:
2258 return None
2259
2260 s = re.sub(r'^[^\d]+\s', '', s).strip()
2261
2262 if re.match(r'^[\d,.]+$', s):
2263 return str_to_int(s)
2264
2265 _UNIT_TABLE = {
2266 'k': 1000,
2267 'K': 1000,
2268 'm': 1000 ** 2,
2269 'M': 1000 ** 2,
2270 'kk': 1000 ** 2,
2271 'KK': 1000 ** 2,
2272 'b': 1000 ** 3,
2273 'B': 1000 ** 3,
2274 }
2275
2276 ret = lookup_unit_table(_UNIT_TABLE, s)
2277 if ret is not None:
2278 return ret
2279
2280 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2281 if mobj:
2282 return str_to_int(mobj.group(1))
2283
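# Illustrative examples (hand-checked):
#   >>> parse_count('1.8M')
#   1800000
#   >>> parse_count('1,234 views')
#   1234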
2284
2285 def parse_resolution(s, *, lenient=False):
2286 if s is None:
2287 return {}
2288
2289 if lenient:
2290 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2291 else:
2292 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2293 if mobj:
2294 return {
2295 'width': int(mobj.group('w')),
2296 'height': int(mobj.group('h')),
2297 }
2298
2299 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2300 if mobj:
2301 return {'height': int(mobj.group(1))}
2302
2303 mobj = re.search(r'\b([48])[kK]\b', s)
2304 if mobj:
2305 return {'height': int(mobj.group(1)) * 540}
2306
2307 return {}
2308
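# Illustrative examples (hand-checked; "4k"/"8k" map to 540 * n):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('4k')
#   {'height': 2160}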
2309
2310 def parse_bitrate(s):
2311 if not isinstance(s, str):
2312 return
2313 mobj = re.search(r'\b(\d+)\s*kbps', s)
2314 if mobj:
2315 return int(mobj.group(1))
2316
2317
2318 def month_by_name(name, lang='en'):
2319 """ Return the number of a month by (locale-independently) English name """
2320
2321 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2322
2323 try:
2324 return month_names.index(name) + 1
2325 except ValueError:
2326 return None
2327
2328
2329 def month_by_abbreviation(abbrev):
2330 """ Return the number of a month by (locale-independently) English
2331 abbreviations """
2332
2333 try:
2334 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2335 except ValueError:
2336 return None
2337
2338
2339 def fix_xml_ampersands(xml_str):
2340 """Replace all the '&' by '&amp;' in XML"""
2341 return re.sub(
2342 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2343 '&amp;',
2344 xml_str)
2345
2346
2347 def setproctitle(title):
2348 assert isinstance(title, str)
2349
2350 # ctypes in Jython is not complete
2351 # http://bugs.jython.org/issue2148
2352 if sys.platform.startswith('java'):
2353 return
2354
2355 try:
2356 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2357 except OSError:
2358 return
2359 except TypeError:
2360 # LoadLibrary in Windows Python 2.7.13 only expects
2361 # a bytestring, but since unicode_literals turns
2362 # every string into a unicode string, it fails.
2363 return
2364 title_bytes = title.encode()
2365 buf = ctypes.create_string_buffer(len(title_bytes))
2366 buf.value = title_bytes
2367 try:
2368 libc.prctl(15, buf, 0, 0, 0)
2369 except AttributeError:
2370 return # Strange libc, just skip this
2371
2372
2373 def remove_start(s, start):
2374 return s[len(start):] if s is not None and s.startswith(start) else s
2375
2376
2377 def remove_end(s, end):
2378 return s[:-len(end)] if s is not None and s.endswith(end) else s
2379
2380
2381 def remove_quotes(s):
2382 if s is None or len(s) < 2:
2383 return s
2384 for quote in ('"', "'", ):
2385 if s[0] == quote and s[-1] == quote:
2386 return s[1:-1]
2387 return s
2388
2389
2390 def get_domain(url):
2391 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2392 return domain.group('domain') if domain else None
2393
2394
2395 def url_basename(url):
2396 path = urllib.parse.urlparse(url).path
2397 return path.strip('/').split('/')[-1]
2398
2399
2400 def base_url(url):
2401 return re.match(r'https?://[^?#&]+/', url).group()
2402
2403
2404 def urljoin(base, path):
2405 if isinstance(path, bytes):
2406 path = path.decode()
2407 if not isinstance(path, str) or not path:
2408 return None
2409 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2410 return path
2411 if isinstance(base, bytes):
2412 base = base.decode()
2413 if not isinstance(base, str) or not re.match(
2414 r'^(?:https?:)?//', base):
2415 return None
2416 return urllib.parse.urljoin(base, path)
2417
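# Illustrative examples (hypothetical URLs): scheme-relative paths are
# returned as-is:
#   >>> urljoin('https://example.com/a/', 'b/c.mp4')
#   'https://example.com/a/b/c.mp4'
#   >>> urljoin('https://example.com/a/', '//cdn.example.com/x.mp4')
#   '//cdn.example.com/x.mp4'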
2418
2419 class HEADRequest(urllib.request.Request):
2420 def get_method(self):
2421 return 'HEAD'
2422
2423
2424 class PUTRequest(urllib.request.Request):
2425 def get_method(self):
2426 return 'PUT'
2427
2428
2429 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2430 if get_attr and v is not None:
2431 v = getattr(v, get_attr, None)
2432 try:
2433 return int(v) * invscale // scale
2434 except (ValueError, TypeError, OverflowError):
2435 return default
2436
2437
2438 def str_or_none(v, default=None):
2439 return default if v is None else str(v)
2440
2441
2442 def str_to_int(int_str):
2443 """ A more relaxed version of int_or_none """
2444 if isinstance(int_str, int):
2445 return int_str
2446 elif isinstance(int_str, str):
2447 int_str = re.sub(r'[,\.\+]', '', int_str)
2448 return int_or_none(int_str)
2449
2450
2451 def float_or_none(v, scale=1, invscale=1, default=None):
2452 if v is None:
2453 return default
2454 try:
2455 return float(v) * invscale / scale
2456 except (ValueError, TypeError):
2457 return default
2458
2459
2460 def bool_or_none(v, default=None):
2461 return v if isinstance(v, bool) else default
2462
2463
2464 def strip_or_none(v, default=None):
2465 return v.strip() if isinstance(v, str) else default
2466
2467
2468 def url_or_none(url):
2469 if not url or not isinstance(url, str):
2470 return None
2471 url = url.strip()
2472 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2473
2474
2475 def request_to_url(req):
2476 if isinstance(req, urllib.request.Request):
2477 return req.get_full_url()
2478 else:
2479 return req
2480
2481
2482 def strftime_or_none(timestamp, date_format, default=None):
2483 datetime_object = None
2484 try:
2485 if isinstance(timestamp, (int, float)): # unix timestamp
2486 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2487 elif isinstance(timestamp, str): # assume YYYYMMDD
2488 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2489 return datetime_object.strftime(date_format)
2490 except (ValueError, TypeError, AttributeError):
2491 return default
2492
2493
2494 def parse_duration(s):
2495 if not isinstance(s, str):
2496 return None
2497 s = s.strip()
2498 if not s:
2499 return None
2500
2501 days, hours, mins, secs, ms = [None] * 5
2502 m = re.match(r'''(?x)
2503 (?P<before_secs>
2504 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2505 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2506 (?P<ms>[.:][0-9]+)?Z?$
2507 ''', s)
2508 if m:
2509 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2510 else:
2511 m = re.match(
2512 r'''(?ix)(?:P?
2513 (?:
2514 [0-9]+\s*y(?:ears?)?,?\s*
2515 )?
2516 (?:
2517 [0-9]+\s*m(?:onths?)?,?\s*
2518 )?
2519 (?:
2520 [0-9]+\s*w(?:eeks?)?,?\s*
2521 )?
2522 (?:
2523 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2524 )?
2525 T)?
2526 (?:
2527 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2528 )?
2529 (?:
2530 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2531 )?
2532 (?:
2533 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2534 )?Z?$''', s)
2535 if m:
2536 days, hours, mins, secs, ms = m.groups()
2537 else:
2538 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2539 if m:
2540 hours, mins = m.groups()
2541 else:
2542 return None
2543
2544 if ms:
2545 ms = ms.replace(':', '.')
2546 return sum(float(part or 0) * mult for part, mult in (
2547 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2548
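# Illustrative examples (hand-checked): both colon-separated and ISO 8601-like
# durations are accepted, and a float is always returned:
#   >>> parse_duration('1:23:45')
#   5025.0
#   >>> parse_duration('PT1H30M')
#   5400.0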
2549
2550 def prepend_extension(filename, ext, expected_real_ext=None):
2551 name, real_ext = os.path.splitext(filename)
2552 return (
2553 f'{name}.{ext}{real_ext}'
2554 if not expected_real_ext or real_ext[1:] == expected_real_ext
2555 else f'{filename}.{ext}')
2556
2557
2558 def replace_extension(filename, ext, expected_real_ext=None):
2559 name, real_ext = os.path.splitext(filename)
2560 return '{}.{}'.format(
2561 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2562 ext)
2563
2564
2565 def check_executable(exe, args=[]):
2566 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2567 args can be a list of arguments for a short output (like -version) """
2568 try:
2569 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2570 except OSError:
2571 return False
2572 return exe
2573
2574
2575 def _get_exe_version_output(exe, args, *, to_screen=None):
2576 if to_screen:
2577 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2578 try:
2579 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2580 # SIGTTOU if yt-dlp is run in the background.
2581 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2582 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2583 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2584 except OSError:
2585 return False
2586 return stdout
2587
2588
2589 def detect_exe_version(output, version_re=None, unrecognized='present'):
2590 assert isinstance(output, str)
2591 if version_re is None:
2592 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2593 m = re.search(version_re, output)
2594 if m:
2595 return m.group(1)
2596 else:
2597 return unrecognized
2598
2599
2600 def get_exe_version(exe, args=['--version'],
2601 version_re=None, unrecognized='present'):
2602 """ Returns the version of the specified executable,
2603 or False if the executable is not present """
2604 out = _get_exe_version_output(exe, args)
2605 return detect_exe_version(out, version_re, unrecognized) if out else False
2606
2607
2608 def frange(start=0, stop=None, step=1):
2609 """Float range"""
2610 if stop is None:
2611 start, stop = 0, start
2612 sign = [-1, 1][step > 0] if step else 0
2613 while sign * start < sign * stop:
2614 yield start
2615 start += step
2616
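# Illustrative example (hand-checked):
#   >>> list(frange(0, 1, 0.25))
#   [0, 0.25, 0.5, 0.75]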
2617
2618 class LazyList(collections.abc.Sequence):
2619 """Lazy immutable list from an iterable
2620 Note that slices of a LazyList are lists and not LazyList"""
2621
2622 class IndexError(IndexError):
2623 pass
2624
2625 def __init__(self, iterable, *, reverse=False, _cache=None):
2626 self._iterable = iter(iterable)
2627 self._cache = [] if _cache is None else _cache
2628 self._reversed = reverse
2629
2630 def __iter__(self):
2631 if self._reversed:
2632 # We need to consume the entire iterable to iterate in reverse
2633 yield from self.exhaust()
2634 return
2635 yield from self._cache
2636 for item in self._iterable:
2637 self._cache.append(item)
2638 yield item
2639
2640 def _exhaust(self):
2641 self._cache.extend(self._iterable)
2642 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2643 return self._cache
2644
2645 def exhaust(self):
2646 """Evaluate the entire iterable"""
2647 return self._exhaust()[::-1 if self._reversed else 1]
2648
2649 @staticmethod
2650 def _reverse_index(x):
2651 return None if x is None else -(x + 1)
2652
2653 def __getitem__(self, idx):
2654 if isinstance(idx, slice):
2655 if self._reversed:
2656 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2657 start, stop, step = idx.start, idx.stop, idx.step or 1
2658 elif isinstance(idx, int):
2659 if self._reversed:
2660 idx = self._reverse_index(idx)
2661 start, stop, step = idx, idx, 0
2662 else:
2663 raise TypeError('indices must be integers or slices')
2664 if ((start or 0) < 0 or (stop or 0) < 0
2665 or (start is None and step < 0)
2666 or (stop is None and step > 0)):
2667 # We need to consume the entire iterable to be able to slice from the end
2668 # Obviously, never use this with infinite iterables
2669 self._exhaust()
2670 try:
2671 return self._cache[idx]
2672 except IndexError as e:
2673 raise self.IndexError(e) from e
2674 n = max(start or 0, stop or 0) - len(self._cache) + 1
2675 if n > 0:
2676 self._cache.extend(itertools.islice(self._iterable, n))
2677 try:
2678 return self._cache[idx]
2679 except IndexError as e:
2680 raise self.IndexError(e) from e
2681
2682 def __bool__(self):
2683 try:
2684 self[-1] if self._reversed else self[0]
2685 except self.IndexError:
2686 return False
2687 return True
2688
2689 def __len__(self):
2690 self._exhaust()
2691 return len(self._cache)
2692
2693 def __reversed__(self):
2694 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2695
2696 def __copy__(self):
2697 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2698
2699 def __repr__(self):
2700 # repr and str should mimic a list. So we exhaust the iterable
2701 return repr(self.exhaust())
2702
2703 def __str__(self):
2704 return repr(self.exhaust())
2705
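# Illustrative example (hand-checked): only as much of the iterable as needed
# is consumed, so bounded slices of infinite iterators are safe:
#   >>> ll = LazyList(itertools.count())
#   >>> ll[:5]
#   [0, 1, 2, 3, 4]
#   >>> ll[10]
#   10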
2706
2707 class PagedList:
2708
2709 class IndexError(IndexError):
2710 pass
2711
2712 def __len__(self):
2713 # This is only useful for tests
2714 return len(self.getslice())
2715
2716 def __init__(self, pagefunc, pagesize, use_cache=True):
2717 self._pagefunc = pagefunc
2718 self._pagesize = pagesize
2719 self._pagecount = float('inf')
2720 self._use_cache = use_cache
2721 self._cache = {}
2722
2723 def getpage(self, pagenum):
2724 page_results = self._cache.get(pagenum)
2725 if page_results is None:
2726 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2727 if self._use_cache:
2728 self._cache[pagenum] = page_results
2729 return page_results
2730
2731 def getslice(self, start=0, end=None):
2732 return list(self._getslice(start, end))
2733
2734 def _getslice(self, start, end):
2735 raise NotImplementedError('This method must be implemented by subclasses')
2736
2737 def __getitem__(self, idx):
2738 assert self._use_cache, 'Indexing PagedList requires cache'
2739 if not isinstance(idx, int) or idx < 0:
2740 raise TypeError('indices must be non-negative integers')
2741 entries = self.getslice(idx, idx + 1)
2742 if not entries:
2743 raise self.IndexError()
2744 return entries[0]
2745
2746
2747 class OnDemandPagedList(PagedList):
2748 """Download pages until a page with less than maximum results"""
2749
2750 def _getslice(self, start, end):
2751 for pagenum in itertools.count(start // self._pagesize):
2752 firstid = pagenum * self._pagesize
2753 nextfirstid = pagenum * self._pagesize + self._pagesize
2754 if start >= nextfirstid:
2755 continue
2756
2757 startv = (
2758 start % self._pagesize
2759 if firstid <= start < nextfirstid
2760 else 0)
2761 endv = (
2762 ((end - 1) % self._pagesize) + 1
2763 if (end is not None and firstid <= end <= nextfirstid)
2764 else None)
2765
2766 try:
2767 page_results = self.getpage(pagenum)
2768 except Exception:
2769 self._pagecount = pagenum - 1
2770 raise
2771 if startv != 0 or endv is not None:
2772 page_results = page_results[startv:endv]
2773 yield from page_results
2774
2775 # A little optimization - if the current page is not "full", i.e. does
2776 # not contain page_size videos, then we can assume that this page
2777 # is the last one - there are no more ids on further pages -
2778 # so there is no need to query again.
2779 if len(page_results) + startv < self._pagesize:
2780 break
2781
2782 # If we got the whole page, but the next page is not interesting,
2783 # break out early as well
2784 if end == nextfirstid:
2785 break
2786
2787
2788 class InAdvancePagedList(PagedList):
2789 """PagedList with total number of pages known in advance"""
2790
2791 def __init__(self, pagefunc, pagecount, pagesize):
2792 PagedList.__init__(self, pagefunc, pagesize, True)
2793 self._pagecount = pagecount
2794
2795 def _getslice(self, start, end):
2796 start_page = start // self._pagesize
2797 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2798 skip_elems = start - start_page * self._pagesize
2799 only_more = None if end is None else end - start
2800 for pagenum in range(start_page, end_page):
2801 page_results = self.getpage(pagenum)
2802 if skip_elems:
2803 page_results = page_results[skip_elems:]
2804 skip_elems = None
2805 if only_more is not None:
2806 if len(page_results) < only_more:
2807 only_more -= len(page_results)
2808 else:
2809 yield from page_results[:only_more]
2810 break
2811 yield from page_results
2812
2813
2814 class PlaylistEntries:
2815 MissingEntry = object()
2816 is_exhausted = False
2817
2818 def __init__(self, ydl, info_dict):
2819 self.ydl = ydl
2820
2821 # _entries must be assigned now since infodict can change during iteration
2822 entries = info_dict.get('entries')
2823 if entries is None:
2824 raise EntryNotInPlaylist('There are no entries')
2825 elif isinstance(entries, list):
2826 self.is_exhausted = True
2827
2828 requested_entries = info_dict.get('requested_entries')
2829 self.is_incomplete = bool(requested_entries)
2830 if self.is_incomplete:
2831 assert self.is_exhausted
2832 self._entries = [self.MissingEntry] * max(requested_entries)
2833 for i, entry in zip(requested_entries, entries):
2834 self._entries[i - 1] = entry
2835 elif isinstance(entries, (list, PagedList, LazyList)):
2836 self._entries = entries
2837 else:
2838 self._entries = LazyList(entries)
2839
2840 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2841 (?P<start>[+-]?\d+)?
2842 (?P<range>[:-]
2843 (?P<end>[+-]?\d+|inf(?:inite)?)?
2844 (?::(?P<step>[+-]?\d+))?
2845 )?''')
2846
2847 @classmethod
2848 def parse_playlist_items(cls, string):
2849 for segment in string.split(','):
2850 if not segment:
2851 raise ValueError('There are two or more consecutive commas')
2852 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2853 if not mobj:
2854 raise ValueError(f'{segment!r} is not a valid specification')
2855 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2856 if int_or_none(step) == 0:
2857 raise ValueError(f'Step in {segment!r} cannot be zero')
2858 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2859
2860 def get_requested_items(self):
2861 playlist_items = self.ydl.params.get('playlist_items')
2862 playlist_start = self.ydl.params.get('playliststart', 1)
2863 playlist_end = self.ydl.params.get('playlistend')
2864 # For backwards compatibility, interpret -1 as whole list
2865 if playlist_end in (-1, None):
2866 playlist_end = ''
2867 if not playlist_items:
2868 playlist_items = f'{playlist_start}:{playlist_end}'
2869 elif playlist_start != 1 or playlist_end:
2870 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2871
2872 for index in self.parse_playlist_items(playlist_items):
2873 for i, entry in self[index]:
2874 yield i, entry
2875 if not entry:
2876 continue
2877 try:
2878 # TODO: Add auto-generated fields
2879 self.ydl._match_entry(entry, incomplete=True, silent=True)
2880 except (ExistingVideoReached, RejectedVideoReached):
2881 return
2882
2883 def get_full_count(self):
2884 if self.is_exhausted and not self.is_incomplete:
2885 return len(self)
2886 elif isinstance(self._entries, InAdvancePagedList):
2887 if self._entries._pagesize == 1:
2888 return self._entries._pagecount
2889
2890 @functools.cached_property
2891 def _getter(self):
2892 if isinstance(self._entries, list):
2893 def get_entry(i):
2894 try:
2895 entry = self._entries[i]
2896 except IndexError:
2897 entry = self.MissingEntry
2898 if not self.is_incomplete:
2899 raise self.IndexError()
2900 if entry is self.MissingEntry:
2901 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2902 return entry
2903 else:
2904 def get_entry(i):
2905 try:
2906 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2907 except (LazyList.IndexError, PagedList.IndexError):
2908 raise self.IndexError()
2909 return get_entry
2910
2911 def __getitem__(self, idx):
2912 if isinstance(idx, int):
2913 idx = slice(idx, idx)
2914
2915 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2916 step = 1 if idx.step is None else idx.step
2917 if idx.start is None:
2918 start = 0 if step > 0 else len(self) - 1
2919 else:
2920 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2921
2922 # NB: Do not call len(self) when idx == [:]
2923 if idx.stop is None:
2924 stop = 0 if step < 0 else float('inf')
2925 else:
2926 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2927 stop += [-1, 1][step > 0]
2928
2929 for i in frange(start, stop, step):
2930 if i < 0:
2931 continue
2932 try:
2933 entry = self._getter(i)
2934 except self.IndexError:
2935 self.is_exhausted = True
2936 if step > 0:
2937 break
2938 continue
2939 yield i + 1, entry
2940
2941 def __len__(self):
2942 return len(tuple(self[:]))
2943
2944 class IndexError(IndexError):
2945 pass
2946
2947
2948 def uppercase_escape(s):
2949 unicode_escape = codecs.getdecoder('unicode_escape')
2950 return re.sub(
2951 r'\\U[0-9a-fA-F]{8}',
2952 lambda m: unicode_escape(m.group(0))[0],
2953 s)
2954
2955
2956 def lowercase_escape(s):
2957 unicode_escape = codecs.getdecoder('unicode_escape')
2958 return re.sub(
2959 r'\\u[0-9a-fA-F]{4}',
2960 lambda m: unicode_escape(m.group(0))[0],
2961 s)
2962
2963
2964 def escape_rfc3986(s):
2965 """Escape non-ASCII characters as suggested by RFC 3986"""
2966 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2967
2968
2969 def escape_url(url):
2970 """Escape URL as suggested by RFC 3986"""
2971 url_parsed = urllib.parse.urlparse(url)
2972 return url_parsed._replace(
2973 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2974 path=escape_rfc3986(url_parsed.path),
2975 params=escape_rfc3986(url_parsed.params),
2976 query=escape_rfc3986(url_parsed.query),
2977 fragment=escape_rfc3986(url_parsed.fragment)
2978 ).geturl()
2979
2980
2981 def parse_qs(url):
2982 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
2983
2984
2985 def read_batch_urls(batch_fd):
2986 def fixup(url):
2987 if not isinstance(url, str):
2988 url = url.decode('utf-8', 'replace')
2989 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2990 for bom in BOM_UTF8:
2991 if url.startswith(bom):
2992 url = url[len(bom):]
2993 url = url.lstrip()
2994 if not url or url.startswith(('#', ';', ']')):
2995 return False
2996 # "#" cannot be stripped out since it is part of the URI
2997 # However, it can be safely stripped out if following a whitespace
2998 return re.split(r'\s#', url, 1)[0].rstrip()
2999
3000 with contextlib.closing(batch_fd) as fd:
3001 return [url for url in map(fixup, fd) if url]
3002
3003
3004 def urlencode_postdata(*args, **kargs):
3005 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3006
3007
3008 def update_url_query(url, query):
3009 if not query:
3010 return url
3011 parsed_url = urllib.parse.urlparse(url)
3012 qs = urllib.parse.parse_qs(parsed_url.query)
3013 qs.update(query)
3014 return urllib.parse.urlunparse(parsed_url._replace(
3015 query=urllib.parse.urlencode(qs, True)))
3016
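# Illustrative example (hypothetical URL):
#   >>> update_url_query('https://example.com/path?a=1', {'b': '2'})
#   'https://example.com/path?a=1&b=2'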
3017
3018 def update_Request(req, url=None, data=None, headers={}, query={}):
3019 req_headers = req.headers.copy()
3020 req_headers.update(headers)
3021 req_data = data or req.data
3022 req_url = update_url_query(url or req.get_full_url(), query)
3023 req_get_method = req.get_method()
3024 if req_get_method == 'HEAD':
3025 req_type = HEADRequest
3026 elif req_get_method == 'PUT':
3027 req_type = PUTRequest
3028 else:
3029 req_type = urllib.request.Request
3030 new_req = req_type(
3031 req_url, data=req_data, headers=req_headers,
3032 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3033 if hasattr(req, 'timeout'):
3034 new_req.timeout = req.timeout
3035 return new_req
3036
3037
3038 def _multipart_encode_impl(data, boundary):
3039 content_type = 'multipart/form-data; boundary=%s' % boundary
3040
3041 out = b''
3042 for k, v in data.items():
3043 out += b'--' + boundary.encode('ascii') + b'\r\n'
3044 if isinstance(k, str):
3045 k = k.encode()
3046 if isinstance(v, str):
3047 v = v.encode()
3048 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3049 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3050 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3051 if boundary.encode('ascii') in content:
3052 raise ValueError('Boundary overlaps with data')
3053 out += content
3054
3055 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3056
3057 return out, content_type
3058
3059
3060 def multipart_encode(data, boundary=None):
3061 '''
3062 Encode a dict to RFC 7578-compliant form-data
3063
3064 data:
3065 A dict where keys and values can be either Unicode or bytes-like
3066 objects.
3067 boundary:
3068 If specified a Unicode object, it's used as the boundary. Otherwise
3069 a random boundary is generated.
3070
3071 Reference: https://tools.ietf.org/html/rfc7578
3072 '''
3073 has_specified_boundary = boundary is not None
3074
3075 while True:
3076 if boundary is None:
3077 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3078
3079 try:
3080 out, content_type = _multipart_encode_impl(data, boundary)
3081 break
3082 except ValueError:
3083 if has_specified_boundary:
3084 raise
3085 boundary = None
3086
3087 return out, content_type
3088
3089
3090 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3091 for val in map(d.get, variadic(key_or_keys)):
3092 if val is not None and (val or not skip_false_values):
3093 return val
3094 return default
3095
3096
3097 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3098 for f in funcs:
3099 try:
3100 val = f(*args, **kwargs)
3101 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3102 pass
3103 else:
3104 if expected_type is None or isinstance(val, expected_type):
3105 return val
3106
3107
3108 def try_get(src, getter, expected_type=None):
3109 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3110
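# Illustrative examples (hand-checked): dict_get skips falsy values by
# default, and try_get swallows lookup errors:
#   >>> dict_get({'a': None, 'b': ''}, ('a', 'b'), default='fallback')
#   'fallback'
#   >>> try_get({'a': [1]}, lambda x: x['a'][0], int)
#   1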
3111
3112 def filter_dict(dct, cndn=lambda _, v: v is not None):
3113 return {k: v for k, v in dct.items() if cndn(k, v)}
3114
3115
3116 def merge_dicts(*dicts):
3117 merged = {}
3118 for a_dict in dicts:
3119 for k, v in a_dict.items():
3120 if (v is not None and k not in merged
3121 or isinstance(v, str) and merged[k] == ''):
3122 merged[k] = v
3123 return merged
3124
3125
3126 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3127 return string if isinstance(string, str) else str(string, encoding, errors)
3128
3129
3130 US_RATINGS = {
3131 'G': 0,
3132 'PG': 10,
3133 'PG-13': 13,
3134 'R': 16,
3135 'NC': 18,
3136 }
3137
3138
3139 TV_PARENTAL_GUIDELINES = {
3140 'TV-Y': 0,
3141 'TV-Y7': 7,
3142 'TV-G': 0,
3143 'TV-PG': 0,
3144 'TV-14': 14,
3145 'TV-MA': 17,
3146 }
3147
3148
3149 def parse_age_limit(s):
3150 # isinstance(False, int) is True. So type() must be used instead
3151 if type(s) is int: # noqa: E721
3152 return s if 0 <= s <= 21 else None
3153 elif not isinstance(s, str):
3154 return None
3155 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3156 if m:
3157 return int(m.group('age'))
3158 s = s.upper()
3159 if s in US_RATINGS:
3160 return US_RATINGS[s]
3161 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3162 if m:
3163 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3164 return None
3165
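# Illustrative examples (hand-checked):
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17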
3166
3167 def strip_jsonp(code):
3168 return re.sub(
3169 r'''(?sx)^
3170 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3171 (?:\s*&&\s*(?P=func_name))?
3172 \s*\(\s*(?P<callback_data>.*)\);?
3173 \s*?(?://[^\n]*)*$''',
3174 r'\g<callback_data>', code)
3175
3176
3177 def js_to_json(code, vars={}):
3178 # vars is a dict of var, val pairs to substitute
3179 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3180 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3181 INTEGER_TABLE = (
3182 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3183 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3184 )
3185
3186 def fix_kv(m):
3187 v = m.group(0)
3188 if v in ('true', 'false', 'null'):
3189 return v
3190 elif v in ('undefined', 'void 0'):
3191 return 'null'
3192 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3193 return ""
3194
3195 if v[0] in ("'", '"'):
3196 v = re.sub(r'(?s)\\.|"', lambda m: {
3197 '"': '\\"',
3198 "\\'": "'",
3199 '\\\n': '',
3200 '\\x': '\\u00',
3201 }.get(m.group(0), m.group(0)), v[1:-1])
3202 else:
3203 for regex, base in INTEGER_TABLE:
3204 im = re.match(regex, v)
3205 if im:
3206 i = int(im.group(1), base)
3207 return '"%d":' % i if v.endswith(':') else '%d' % i
3208
3209 if v in vars:
3210 return vars[v]
3211
3212 return '"%s"' % v
3213
3214 def create_map(mobj):
3215 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3216
3217 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3218 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3219
3220 return re.sub(r'''(?sx)
3221 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3222 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3223 {comment}|,(?={skip}[\]}}])|
3224 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3225 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3226 [0-9]+(?={skip}:)|
3227 !+
3228 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3229
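# Illustrative example (hand-checked): bare keys are quoted, single-quoted
# strings are re-quoted and hex literals are converted to decimal:
#   >>> js_to_json("{abc: 'def', ghi: 0x10}")
#   '{"abc": "def", "ghi": 16}'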
3230
3231 def qualities(quality_ids):
3232 """ Get a numeric quality value out of a list of possible values """
3233 def q(qid):
3234 try:
3235 return quality_ids.index(qid)
3236 except ValueError:
3237 return -1
3238 return q
3239
3240
3241 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3242
3243
3244 DEFAULT_OUTTMPL = {
3245 'default': '%(title)s [%(id)s].%(ext)s',
3246 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3247 }
3248 OUTTMPL_TYPES = {
3249 'chapter': None,
3250 'subtitle': None,
3251 'thumbnail': None,
3252 'description': 'description',
3253 'annotation': 'annotations.xml',
3254 'infojson': 'info.json',
3255 'link': None,
3256 'pl_video': None,
3257 'pl_thumbnail': None,
3258 'pl_description': 'description',
3259 'pl_infojson': 'info.json',
3260 }
3261
3262 # As of [1] format syntax is:
3263 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3264 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3265 STR_FORMAT_RE_TMPL = r'''(?x)
3266 (?<!%)(?P<prefix>(?:%%)*)
3267 %
3268 (?P<has_key>\((?P<key>{0})\))?
3269 (?P<format>
3270 (?P<conversion>[#0\-+ ]+)?
3271 (?P<min_width>\d+)?
3272 (?P<precision>\.\d+)?
3273 (?P<len_mod>[hlL])? # unused in python
3274 {1} # conversion type
3275 )
3276 '''
3277
3278
3279 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3280
3281
3282 def limit_length(s, length):
3283 """ Add ellipses to overly long strings """
3284 if s is None:
3285 return None
3286 ELLIPSES = '...'
3287 if len(s) > length:
3288 return s[:length - len(ELLIPSES)] + ELLIPSES
3289 return s
3290
3291
3292 def version_tuple(v):
3293 return tuple(int(e) for e in re.split(r'[-.]', v))
3294
3295
3296 def is_outdated_version(version, limit, assume_new=True):
3297 if not version:
3298 return not assume_new
3299 try:
3300 return version_tuple(version) < version_tuple(limit)
3301 except ValueError:
3302 return not assume_new
3303
3304
3305 def ytdl_is_updateable():
3306 """ Returns if yt-dlp can be updated with -U """
3307
3308 from .update import is_non_updateable
3309
3310 return not is_non_updateable()
3311
3312
3313 def args_to_str(args):
3314 # Get a short string representation for a subprocess command
3315 return ' '.join(compat_shlex_quote(a) for a in args)
3316
3317
3318 def error_to_compat_str(err):
3319 return str(err)
3320
3321
3322 def error_to_str(err):
3323 return f'{type(err).__name__}: {err}'
3324
3325
3326 def mimetype2ext(mt):
3327 if mt is None:
3328 return None
3329
3330 mt, _, params = mt.partition(';')
3331 mt = mt.strip()
3332
3333 FULL_MAP = {
3334 'audio/mp4': 'm4a',
3335 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3
3336 # here since it's the most popular one
3337 'audio/mpeg': 'mp3',
3338 'audio/x-wav': 'wav',
3339 'audio/wav': 'wav',
3340 'audio/wave': 'wav',
3341 }
3342
3343 ext = FULL_MAP.get(mt)
3344 if ext is not None:
3345 return ext
3346
3347 SUBTYPE_MAP = {
3348 '3gpp': '3gp',
3349 'smptett+xml': 'tt',
3350 'ttaf+xml': 'dfxp',
3351 'ttml+xml': 'ttml',
3352 'x-flv': 'flv',
3353 'x-mp4-fragmented': 'mp4',
3354 'x-ms-sami': 'sami',
3355 'x-ms-wmv': 'wmv',
3356 'mpegurl': 'm3u8',
3357 'x-mpegurl': 'm3u8',
3358 'vnd.apple.mpegurl': 'm3u8',
3359 'dash+xml': 'mpd',
3360 'f4m+xml': 'f4m',
3361 'hds+xml': 'f4m',
3362 'vnd.ms-sstr+xml': 'ism',
3363 'quicktime': 'mov',
3364 'mp2t': 'ts',
3365 'x-wav': 'wav',
3366 'filmstrip+json': 'fs',
3367 'svg+xml': 'svg',
3368 }
3369
3370 _, _, subtype = mt.rpartition('/')
3371 ext = SUBTYPE_MAP.get(subtype.lower())
3372 if ext is not None:
3373 return ext
3374
3375 SUFFIX_MAP = {
3376 'json': 'json',
3377 'xml': 'xml',
3378 'zip': 'zip',
3379 'gzip': 'gz',
3380 }
3381
3382 _, _, suffix = subtype.partition('+')
3383 ext = SUFFIX_MAP.get(suffix)
3384 if ext is not None:
3385 return ext
3386
3387 return subtype.replace('+', '.')
3388
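# Illustrative examples (hand-checked; MIME parameters after ';' are ignored):
#   >>> mimetype2ext('audio/x-wav;codec=pcm')
#   'wav'
#   >>> mimetype2ext('application/dash+xml')
#   'mpd'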
3389
3390 def ext2mimetype(ext_or_url):
3391 if not ext_or_url:
3392 return None
3393 if '.' not in ext_or_url:
3394 ext_or_url = f'file.{ext_or_url}'
3395 return mimetypes.guess_type(ext_or_url)[0]
3396
3397
3398 def parse_codecs(codecs_str):
3399 # http://tools.ietf.org/html/rfc6381
3400 if not codecs_str:
3401 return {}
3402 split_codecs = list(filter(None, map(
3403 str.strip, codecs_str.strip().strip(',').split(','))))
3404 vcodec, acodec, scodec, hdr = None, None, None, None
3405 for full_codec in split_codecs:
3406 parts = full_codec.split('.')
3407 codec = parts[0].replace('0', '')
3408 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3409 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3410 if not vcodec:
3411 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3412 if codec in ('dvh1', 'dvhe'):
3413 hdr = 'DV'
3414 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3415 hdr = 'HDR10'
3416 elif full_codec.replace('0', '').startswith('vp9.2'):
3417 hdr = 'HDR10'
3418 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3419 if not acodec:
3420 acodec = full_codec
3421 elif codec in ('stpp', 'wvtt',):
3422 if not scodec:
3423 scodec = full_codec
3424 else:
3425 write_string(f'WARNING: Unknown codec {full_codec}\n')
3426 if vcodec or acodec or scodec:
3427 return {
3428 'vcodec': vcodec or 'none',
3429 'acodec': acodec or 'none',
3430 'dynamic_range': hdr,
3431 **({'scodec': scodec} if scodec is not None else {}),
3432 }
3433 elif len(split_codecs) == 2:
3434 return {
3435 'vcodec': split_codecs[0],
3436 'acodec': split_codecs[1],
3437 }
3438 return {}
3439
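# Illustrative example (hand-checked):
#   >>> parse_codecs('avc1.64001F, mp4a.40.2')
#   {'vcodec': 'avc1.64001F', 'acodec': 'mp4a.40.2', 'dynamic_range': None}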
3440
3441 def urlhandle_detect_ext(url_handle):
3442 getheader = url_handle.headers.get
3443
3444 cd = getheader('Content-Disposition')
3445 if cd:
3446 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3447 if m:
3448 e = determine_ext(m.group('filename'), default_ext=None)
3449 if e:
3450 return e
3451
3452 return mimetype2ext(getheader('Content-Type'))
3453
3454
3455 def encode_data_uri(data, mime_type):
3456 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3457
3458
3459 def age_restricted(content_limit, age_limit):
3460 """ Returns True iff the content should be blocked """
3461
3462 if age_limit is None: # No limit set
3463 return False
3464 if content_limit is None:
3465 return False # Content available for everyone
3466 return age_limit < content_limit
3467
3468
3469 def is_html(first_bytes):
3470 """ Detect whether a file contains HTML by examining its first bytes. """
3471
3472 BOMS = [
3473 (b'\xef\xbb\xbf', 'utf-8'),
3474 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3475 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3476 (b'\xff\xfe', 'utf-16-le'),
3477 (b'\xfe\xff', 'utf-16-be'),
3478 ]
3479
3480 encoding = 'utf-8'
3481 for bom, enc in BOMS:
3482 while first_bytes.startswith(bom):
3483 encoding, first_bytes = enc, first_bytes[len(bom):]
3484
3485 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3486
3487
3488 def determine_protocol(info_dict):
3489 protocol = info_dict.get('protocol')
3490 if protocol is not None:
3491 return protocol
3492
3493 url = sanitize_url(info_dict['url'])
3494 if url.startswith('rtmp'):
3495 return 'rtmp'
3496 elif url.startswith('mms'):
3497 return 'mms'
3498 elif url.startswith('rtsp'):
3499 return 'rtsp'
3500
3501 ext = determine_ext(url)
3502 if ext == 'm3u8':
3503 return 'm3u8'
3504 elif ext == 'f4m':
3505 return 'f4m'
3506
3507 return urllib.parse.urlparse(url).scheme
3508
3509
3510 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3511 """ Render a list of rows, each as a list of values.
3512 Text after a \t will be right aligned """
3513 def width(string):
3514 return len(remove_terminal_sequences(string).replace('\t', ''))
3515
3516 def get_max_lens(table):
3517 return [max(width(str(v)) for v in col) for col in zip(*table)]
3518
3519 def filter_using_list(row, filterArray):
3520 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3521
3522 max_lens = get_max_lens(data) if hide_empty else []
3523 header_row = filter_using_list(header_row, max_lens)
3524 data = [filter_using_list(row, max_lens) for row in data]
3525
3526 table = [header_row] + data
3527 max_lens = get_max_lens(table)
3528 extra_gap += 1
3529 if delim:
3530 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3531 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3532 for row in table:
3533 for pos, text in enumerate(map(str, row)):
3534 if '\t' in text:
3535 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3536 else:
3537 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3538 ret = '\n'.join(''.join(row).rstrip() for row in table)
3539 return ret
3540
3541
3542 def _match_one(filter_part, dct, incomplete):
3543 # TODO: Generalize code with YoutubeDL._build_format_filter
3544 STRING_OPERATORS = {
3545 '*=': operator.contains,
3546 '^=': lambda attr, value: attr.startswith(value),
3547 '$=': lambda attr, value: attr.endswith(value),
3548 '~=': lambda attr, value: re.search(value, attr),
3549 }
3550 COMPARISON_OPERATORS = {
3551 **STRING_OPERATORS,
3552 '<=': operator.le, # "<=" must be defined above "<"
3553 '<': operator.lt,
3554 '>=': operator.ge,
3555 '>': operator.gt,
3556 '=': operator.eq,
3557 }
3558
3559 if isinstance(incomplete, bool):
3560 is_incomplete = lambda _: incomplete
3561 else:
3562 is_incomplete = lambda k: k in incomplete
3563
3564 operator_rex = re.compile(r'''(?x)
3565 (?P<key>[a-z_]+)
3566 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3567 (?:
3568 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3569 (?P<strval>.+?)
3570 )
3571 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3572 m = operator_rex.fullmatch(filter_part.strip())
3573 if m:
3574 m = m.groupdict()
3575 unnegated_op = COMPARISON_OPERATORS[m['op']]
3576 if m['negation']:
3577 op = lambda attr, value: not unnegated_op(attr, value)
3578 else:
3579 op = unnegated_op
3580 comparison_value = m['quotedstrval'] or m['strval']
3581 if m['quote']:
3582 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3583 actual_value = dct.get(m['key'])
3584 numeric_comparison = None
3585 if isinstance(actual_value, (int, float)):
3586 # If the original field is a string and the matching comparison value is
3587 # a number, we should respect the origin of the original field
3588 # and process the comparison value as a string (see
3589 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3590 try:
3591 numeric_comparison = int(comparison_value)
3592 except ValueError:
3593 numeric_comparison = parse_filesize(comparison_value)
3594 if numeric_comparison is None:
3595 numeric_comparison = parse_filesize(f'{comparison_value}B')
3596 if numeric_comparison is None:
3597 numeric_comparison = parse_duration(comparison_value)
3598 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3599 raise ValueError('Operator %s only supports string values!' % m['op'])
3600 if actual_value is None:
3601 return is_incomplete(m['key']) or m['none_inclusive']
3602 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3603
3604 UNARY_OPERATORS = {
3605 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3606 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3607 }
3608 operator_rex = re.compile(r'''(?x)
3609 (?P<op>%s)\s*(?P<key>[a-z_]+)
3610 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3611 m = operator_rex.fullmatch(filter_part.strip())
3612 if m:
3613 op = UNARY_OPERATORS[m.group('op')]
3614 actual_value = dct.get(m.group('key'))
3615 if is_incomplete(m.group('key')) and actual_value is None:
3616 return True
3617 return op(actual_value)
3618
3619 raise ValueError('Invalid filter part %r' % filter_part)
3620
3621
3622 def match_str(filter_str, dct, incomplete=False):
3623 """ Filter a dictionary with a simple string syntax.
3624 @returns Whether the filter passes
3625 @param incomplete Set of keys that are expected to be missing from dct.
3626 Can be True/False to indicate all/none of the keys may be missing.
3627 All conditions on incomplete keys pass if the key is missing
3628 """
3629 return all(
3630 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3631 for filter_part in re.split(r'(?<!\\)&', filter_str))
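
# Illustrative example (editor's sketch): '&' separates conditions, '!' negates
# >>> match_str('duration < 120 & !is_live', {'duration': 90, 'is_live': False})
# True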
3632
3633
3634 def match_filter_func(filters):
3635 if not filters:
3636 return None
3637 filters = set(variadic(filters))
3638
3639 interactive = '-' in filters
3640 if interactive:
3641 filters.remove('-')
3642
3643 def _match_func(info_dict, incomplete=False):
3644 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3645 return NO_DEFAULT if interactive and not incomplete else None
3646 else:
3647 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3648 filter_str = ') | ('.join(map(str.strip, filters))
3649 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3650 return _match_func
3651
3652
3653 def download_range_func(chapters, ranges):
3654 def inner(info_dict, ydl):
3655 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3656 else 'Cannot match chapters since chapter information is unavailable')
3657 for regex in chapters or []:
3658 for i, chapter in enumerate(info_dict.get('chapters') or []):
3659 if re.search(regex, chapter['title']):
3660 warning = None
3661 yield {**chapter, 'index': i}
3662 if chapters and warning:
3663 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3664
3665 yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
3666
3667 return inner
3668
3669
3670 def parse_dfxp_time_expr(time_expr):
3671 if not time_expr:
3672 return
3673
3674 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3675 if mobj:
3676 return float(mobj.group('time_offset'))
3677
3678 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3679 if mobj:
3680 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
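
# Illustrative examples (editor's sketch):
# >>> parse_dfxp_time_expr('00:01:30.5')
# 90.5
# >>> parse_dfxp_time_expr('12.3s')
# 12.3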
3681
3682
3683 def srt_subtitles_timecode(seconds):
3684 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3685
3686
3687 def ass_subtitles_timecode(seconds):
3688 time = timetuple_from_msec(seconds * 1000)
3689 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
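
# Illustrative examples (editor's sketch):
# >>> srt_subtitles_timecode(3661.5)
# '01:01:01,500'
# >>> ass_subtitles_timecode(3661.5)
# '1:01:01.50'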
3690
3691
3692 def dfxp2srt(dfxp_data):
3693 '''
3694 @param dfxp_data A bytes-like object containing DFXP data
3695 @returns A unicode object containing converted SRT data
3696 '''
3697 LEGACY_NAMESPACES = (
3698 (b'http://www.w3.org/ns/ttml', [
3699 b'http://www.w3.org/2004/11/ttaf1',
3700 b'http://www.w3.org/2006/04/ttaf1',
3701 b'http://www.w3.org/2006/10/ttaf1',
3702 ]),
3703 (b'http://www.w3.org/ns/ttml#styling', [
3704 b'http://www.w3.org/ns/ttml#style',
3705 ]),
3706 )
3707
3708 SUPPORTED_STYLING = [
3709 'color',
3710 'fontFamily',
3711 'fontSize',
3712 'fontStyle',
3713 'fontWeight',
3714 'textDecoration'
3715 ]
3716
3717 _x = functools.partial(xpath_with_ns, ns_map={
3718 'xml': 'http://www.w3.org/XML/1998/namespace',
3719 'ttml': 'http://www.w3.org/ns/ttml',
3720 'tts': 'http://www.w3.org/ns/ttml#styling',
3721 })
3722
3723 styles = {}
3724 default_style = {}
3725
3726 class TTMLPElementParser:
3727 _out = ''
3728 _unclosed_elements = []
3729 _applied_styles = []
3730
3731 def start(self, tag, attrib):
3732 if tag in (_x('ttml:br'), 'br'):
3733 self._out += '\n'
3734 else:
3735 unclosed_elements = []
3736 style = {}
3737 element_style_id = attrib.get('style')
3738 if default_style:
3739 style.update(default_style)
3740 if element_style_id:
3741 style.update(styles.get(element_style_id, {}))
3742 for prop in SUPPORTED_STYLING:
3743 prop_val = attrib.get(_x('tts:' + prop))
3744 if prop_val:
3745 style[prop] = prop_val
3746 if style:
3747 font = ''
3748 for k, v in sorted(style.items()):
3749 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3750 continue
3751 if k == 'color':
3752 font += ' color="%s"' % v
3753 elif k == 'fontSize':
3754 font += ' size="%s"' % v
3755 elif k == 'fontFamily':
3756 font += ' face="%s"' % v
3757 elif k == 'fontWeight' and v == 'bold':
3758 self._out += '<b>'
3759 unclosed_elements.append('b')
3760 elif k == 'fontStyle' and v == 'italic':
3761 self._out += '<i>'
3762 unclosed_elements.append('i')
3763 elif k == 'textDecoration' and v == 'underline':
3764 self._out += '<u>'
3765 unclosed_elements.append('u')
3766 if font:
3767 self._out += '<font' + font + '>'
3768 unclosed_elements.append('font')
3769 applied_style = {}
3770 if self._applied_styles:
3771 applied_style.update(self._applied_styles[-1])
3772 applied_style.update(style)
3773 self._applied_styles.append(applied_style)
3774 self._unclosed_elements.append(unclosed_elements)
3775
3776 def end(self, tag):
3777 if tag not in (_x('ttml:br'), 'br'):
3778 unclosed_elements = self._unclosed_elements.pop()
3779 for element in reversed(unclosed_elements):
3780 self._out += '</%s>' % element
3781 if unclosed_elements and self._applied_styles:
3782 self._applied_styles.pop()
3783
3784 def data(self, data):
3785 self._out += data
3786
3787 def close(self):
3788 return self._out.strip()
3789
3790 def parse_node(node):
3791 target = TTMLPElementParser()
3792 parser = xml.etree.ElementTree.XMLParser(target=target)
3793 parser.feed(xml.etree.ElementTree.tostring(node))
3794 return parser.close()
3795
3796 for k, v in LEGACY_NAMESPACES:
3797 for ns in v:
3798 dfxp_data = dfxp_data.replace(ns, k)
3799
3800 dfxp = compat_etree_fromstring(dfxp_data)
3801 out = []
3802 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3803
3804 if not paras:
3805 raise ValueError('Invalid dfxp/TTML subtitle')
3806
3807 repeat = False
3808 while True:
3809 for style in dfxp.findall(_x('.//ttml:style')):
3810 style_id = style.get('id') or style.get(_x('xml:id'))
3811 if not style_id:
3812 continue
3813 parent_style_id = style.get('style')
3814 if parent_style_id:
3815 if parent_style_id not in styles:
3816 repeat = True
3817 continue
3818 styles[style_id] = styles[parent_style_id].copy()
3819 for prop in SUPPORTED_STYLING:
3820 prop_val = style.get(_x('tts:' + prop))
3821 if prop_val:
3822 styles.setdefault(style_id, {})[prop] = prop_val
3823 if repeat:
3824 repeat = False
3825 else:
3826 break
3827
3828 for p in ('body', 'div'):
3829 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3830 if ele is None:
3831 continue
3832 style = styles.get(ele.get('style'))
3833 if not style:
3834 continue
3835 default_style.update(style)
3836
3837 for para, index in zip(paras, itertools.count(1)):
3838 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3839 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3840 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3841 if begin_time is None:
3842 continue
3843 if not end_time:
3844 if not dur:
3845 continue
3846 end_time = begin_time + dur
3847 out.append('%d\n%s --> %s\n%s\n\n' % (
3848 index,
3849 srt_subtitles_timecode(begin_time),
3850 srt_subtitles_timecode(end_time),
3851 parse_node(para)))
3852
3853 return ''.join(out)
3854
3855
3856 def cli_option(params, command_option, param, separator=None):
3857 param = params.get(param)
3858 return ([] if param is None
3859 else [command_option, str(param)] if separator is None
3860 else [f'{command_option}{separator}{param}'])
3861
3862
3863 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3864 param = params.get(param)
3865 assert param in (True, False, None)
3866 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3867
3868
3869 def cli_valueless_option(params, command_option, param, expected_value=True):
3870 return [command_option] if params.get(param) == expected_value else []
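
# Illustrative examples (editor's sketch). Note that cli_bool_option reuses
# cli_option by looking the boolean up in a {True: ..., False: ...} map:
# >>> cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
# ['--proxy', '127.0.0.1:3128']
# >>> cli_bool_option({'check_cert': True}, '--check-certificate', 'check_cert', separator='=')
# ['--check-certificate=true']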
3871
3872
3873 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3874 if isinstance(argdict, (list, tuple)): # for backward compatibility
3875 if use_compat:
3876 return argdict
3877 else:
3878 argdict = None
3879 if argdict is None:
3880 return default
3881 assert isinstance(argdict, dict)
3882
3883 assert isinstance(keys, (list, tuple))
3884 for key_list in keys:
3885 arg_list = list(filter(
3886 lambda x: x is not None,
3887 [argdict.get(key.lower()) for key in variadic(key_list)]))
3888 if arg_list:
3889 return [arg for args in arg_list for arg in args]
3890 return default
3891
3892
3893 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3894 main_key, exe = main_key.lower(), exe.lower()
3895 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3896 keys = [f'{root_key}{k}' for k in (keys or [''])]
3897 if root_key in keys:
3898 if main_key != exe:
3899 keys.append((main_key, exe))
3900 keys.append('default')
3901 else:
3902 use_compat = False
3903 return cli_configuration_args(argdict, keys, default, use_compat)
3904
3905
3906 class ISO639Utils:
3907 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3908 _lang_map = {
3909 'aa': 'aar',
3910 'ab': 'abk',
3911 'ae': 'ave',
3912 'af': 'afr',
3913 'ak': 'aka',
3914 'am': 'amh',
3915 'an': 'arg',
3916 'ar': 'ara',
3917 'as': 'asm',
3918 'av': 'ava',
3919 'ay': 'aym',
3920 'az': 'aze',
3921 'ba': 'bak',
3922 'be': 'bel',
3923 'bg': 'bul',
3924 'bh': 'bih',
3925 'bi': 'bis',
3926 'bm': 'bam',
3927 'bn': 'ben',
3928 'bo': 'bod',
3929 'br': 'bre',
3930 'bs': 'bos',
3931 'ca': 'cat',
3932 'ce': 'che',
3933 'ch': 'cha',
3934 'co': 'cos',
3935 'cr': 'cre',
3936 'cs': 'ces',
3937 'cu': 'chu',
3938 'cv': 'chv',
3939 'cy': 'cym',
3940 'da': 'dan',
3941 'de': 'deu',
3942 'dv': 'div',
3943 'dz': 'dzo',
3944 'ee': 'ewe',
3945 'el': 'ell',
3946 'en': 'eng',
3947 'eo': 'epo',
3948 'es': 'spa',
3949 'et': 'est',
3950 'eu': 'eus',
3951 'fa': 'fas',
3952 'ff': 'ful',
3953 'fi': 'fin',
3954 'fj': 'fij',
3955 'fo': 'fao',
3956 'fr': 'fra',
3957 'fy': 'fry',
3958 'ga': 'gle',
3959 'gd': 'gla',
3960 'gl': 'glg',
3961 'gn': 'grn',
3962 'gu': 'guj',
3963 'gv': 'glv',
3964 'ha': 'hau',
3965 'he': 'heb',
3966 'iw': 'heb', # Replaced by he in 1989 revision
3967 'hi': 'hin',
3968 'ho': 'hmo',
3969 'hr': 'hrv',
3970 'ht': 'hat',
3971 'hu': 'hun',
3972 'hy': 'hye',
3973 'hz': 'her',
3974 'ia': 'ina',
3975 'id': 'ind',
3976 'in': 'ind', # Replaced by id in 1989 revision
3977 'ie': 'ile',
3978 'ig': 'ibo',
3979 'ii': 'iii',
3980 'ik': 'ipk',
3981 'io': 'ido',
3982 'is': 'isl',
3983 'it': 'ita',
3984 'iu': 'iku',
3985 'ja': 'jpn',
3986 'jv': 'jav',
3987 'ka': 'kat',
3988 'kg': 'kon',
3989 'ki': 'kik',
3990 'kj': 'kua',
3991 'kk': 'kaz',
3992 'kl': 'kal',
3993 'km': 'khm',
3994 'kn': 'kan',
3995 'ko': 'kor',
3996 'kr': 'kau',
3997 'ks': 'kas',
3998 'ku': 'kur',
3999 'kv': 'kom',
4000 'kw': 'cor',
4001 'ky': 'kir',
4002 'la': 'lat',
4003 'lb': 'ltz',
4004 'lg': 'lug',
4005 'li': 'lim',
4006 'ln': 'lin',
4007 'lo': 'lao',
4008 'lt': 'lit',
4009 'lu': 'lub',
4010 'lv': 'lav',
4011 'mg': 'mlg',
4012 'mh': 'mah',
4013 'mi': 'mri',
4014 'mk': 'mkd',
4015 'ml': 'mal',
4016 'mn': 'mon',
4017 'mr': 'mar',
4018 'ms': 'msa',
4019 'mt': 'mlt',
4020 'my': 'mya',
4021 'na': 'nau',
4022 'nb': 'nob',
4023 'nd': 'nde',
4024 'ne': 'nep',
4025 'ng': 'ndo',
4026 'nl': 'nld',
4027 'nn': 'nno',
4028 'no': 'nor',
4029 'nr': 'nbl',
4030 'nv': 'nav',
4031 'ny': 'nya',
4032 'oc': 'oci',
4033 'oj': 'oji',
4034 'om': 'orm',
4035 'or': 'ori',
4036 'os': 'oss',
4037 'pa': 'pan',
4038 'pi': 'pli',
4039 'pl': 'pol',
4040 'ps': 'pus',
4041 'pt': 'por',
4042 'qu': 'que',
4043 'rm': 'roh',
4044 'rn': 'run',
4045 'ro': 'ron',
4046 'ru': 'rus',
4047 'rw': 'kin',
4048 'sa': 'san',
4049 'sc': 'srd',
4050 'sd': 'snd',
4051 'se': 'sme',
4052 'sg': 'sag',
4053 'si': 'sin',
4054 'sk': 'slk',
4055 'sl': 'slv',
4056 'sm': 'smo',
4057 'sn': 'sna',
4058 'so': 'som',
4059 'sq': 'sqi',
4060 'sr': 'srp',
4061 'ss': 'ssw',
4062 'st': 'sot',
4063 'su': 'sun',
4064 'sv': 'swe',
4065 'sw': 'swa',
4066 'ta': 'tam',
4067 'te': 'tel',
4068 'tg': 'tgk',
4069 'th': 'tha',
4070 'ti': 'tir',
4071 'tk': 'tuk',
4072 'tl': 'tgl',
4073 'tn': 'tsn',
4074 'to': 'ton',
4075 'tr': 'tur',
4076 'ts': 'tso',
4077 'tt': 'tat',
4078 'tw': 'twi',
4079 'ty': 'tah',
4080 'ug': 'uig',
4081 'uk': 'ukr',
4082 'ur': 'urd',
4083 'uz': 'uzb',
4084 've': 'ven',
4085 'vi': 'vie',
4086 'vo': 'vol',
4087 'wa': 'wln',
4088 'wo': 'wol',
4089 'xh': 'xho',
4090 'yi': 'yid',
4091 'ji': 'yid', # Replaced by yi in 1989 revision
4092 'yo': 'yor',
4093 'za': 'zha',
4094 'zh': 'zho',
4095 'zu': 'zul',
4096 }
4097
4098 @classmethod
4099 def short2long(cls, code):
4100 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4101 return cls._lang_map.get(code[:2])
4102
4103 @classmethod
4104 def long2short(cls, code):
4105 """Convert language code from ISO 639-2/T to ISO 639-1"""
4106 for short_name, long_name in cls._lang_map.items():
4107 if long_name == code:
4108 return short_name
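
# Illustrative examples (editor's sketch):
# >>> ISO639Utils.short2long('en')
# 'eng'
# >>> ISO639Utils.long2short('deu')
# 'de'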
4109
4110
4111 class ISO3166Utils:
4112 # From http://data.okfn.org/data/core/country-list
4113 _country_map = {
4114 'AF': 'Afghanistan',
4115 'AX': 'Åland Islands',
4116 'AL': 'Albania',
4117 'DZ': 'Algeria',
4118 'AS': 'American Samoa',
4119 'AD': 'Andorra',
4120 'AO': 'Angola',
4121 'AI': 'Anguilla',
4122 'AQ': 'Antarctica',
4123 'AG': 'Antigua and Barbuda',
4124 'AR': 'Argentina',
4125 'AM': 'Armenia',
4126 'AW': 'Aruba',
4127 'AU': 'Australia',
4128 'AT': 'Austria',
4129 'AZ': 'Azerbaijan',
4130 'BS': 'Bahamas',
4131 'BH': 'Bahrain',
4132 'BD': 'Bangladesh',
4133 'BB': 'Barbados',
4134 'BY': 'Belarus',
4135 'BE': 'Belgium',
4136 'BZ': 'Belize',
4137 'BJ': 'Benin',
4138 'BM': 'Bermuda',
4139 'BT': 'Bhutan',
4140 'BO': 'Bolivia, Plurinational State of',
4141 'BQ': 'Bonaire, Sint Eustatius and Saba',
4142 'BA': 'Bosnia and Herzegovina',
4143 'BW': 'Botswana',
4144 'BV': 'Bouvet Island',
4145 'BR': 'Brazil',
4146 'IO': 'British Indian Ocean Territory',
4147 'BN': 'Brunei Darussalam',
4148 'BG': 'Bulgaria',
4149 'BF': 'Burkina Faso',
4150 'BI': 'Burundi',
4151 'KH': 'Cambodia',
4152 'CM': 'Cameroon',
4153 'CA': 'Canada',
4154 'CV': 'Cape Verde',
4155 'KY': 'Cayman Islands',
4156 'CF': 'Central African Republic',
4157 'TD': 'Chad',
4158 'CL': 'Chile',
4159 'CN': 'China',
4160 'CX': 'Christmas Island',
4161 'CC': 'Cocos (Keeling) Islands',
4162 'CO': 'Colombia',
4163 'KM': 'Comoros',
4164 'CG': 'Congo',
4165 'CD': 'Congo, the Democratic Republic of the',
4166 'CK': 'Cook Islands',
4167 'CR': 'Costa Rica',
4168 'CI': 'Côte d\'Ivoire',
4169 'HR': 'Croatia',
4170 'CU': 'Cuba',
4171 'CW': 'Curaçao',
4172 'CY': 'Cyprus',
4173 'CZ': 'Czech Republic',
4174 'DK': 'Denmark',
4175 'DJ': 'Djibouti',
4176 'DM': 'Dominica',
4177 'DO': 'Dominican Republic',
4178 'EC': 'Ecuador',
4179 'EG': 'Egypt',
4180 'SV': 'El Salvador',
4181 'GQ': 'Equatorial Guinea',
4182 'ER': 'Eritrea',
4183 'EE': 'Estonia',
4184 'ET': 'Ethiopia',
4185 'FK': 'Falkland Islands (Malvinas)',
4186 'FO': 'Faroe Islands',
4187 'FJ': 'Fiji',
4188 'FI': 'Finland',
4189 'FR': 'France',
4190 'GF': 'French Guiana',
4191 'PF': 'French Polynesia',
4192 'TF': 'French Southern Territories',
4193 'GA': 'Gabon',
4194 'GM': 'Gambia',
4195 'GE': 'Georgia',
4196 'DE': 'Germany',
4197 'GH': 'Ghana',
4198 'GI': 'Gibraltar',
4199 'GR': 'Greece',
4200 'GL': 'Greenland',
4201 'GD': 'Grenada',
4202 'GP': 'Guadeloupe',
4203 'GU': 'Guam',
4204 'GT': 'Guatemala',
4205 'GG': 'Guernsey',
4206 'GN': 'Guinea',
4207 'GW': 'Guinea-Bissau',
4208 'GY': 'Guyana',
4209 'HT': 'Haiti',
4210 'HM': 'Heard Island and McDonald Islands',
4211 'VA': 'Holy See (Vatican City State)',
4212 'HN': 'Honduras',
4213 'HK': 'Hong Kong',
4214 'HU': 'Hungary',
4215 'IS': 'Iceland',
4216 'IN': 'India',
4217 'ID': 'Indonesia',
4218 'IR': 'Iran, Islamic Republic of',
4219 'IQ': 'Iraq',
4220 'IE': 'Ireland',
4221 'IM': 'Isle of Man',
4222 'IL': 'Israel',
4223 'IT': 'Italy',
4224 'JM': 'Jamaica',
4225 'JP': 'Japan',
4226 'JE': 'Jersey',
4227 'JO': 'Jordan',
4228 'KZ': 'Kazakhstan',
4229 'KE': 'Kenya',
4230 'KI': 'Kiribati',
4231 'KP': 'Korea, Democratic People\'s Republic of',
4232 'KR': 'Korea, Republic of',
4233 'KW': 'Kuwait',
4234 'KG': 'Kyrgyzstan',
4235 'LA': 'Lao People\'s Democratic Republic',
4236 'LV': 'Latvia',
4237 'LB': 'Lebanon',
4238 'LS': 'Lesotho',
4239 'LR': 'Liberia',
4240 'LY': 'Libya',
4241 'LI': 'Liechtenstein',
4242 'LT': 'Lithuania',
4243 'LU': 'Luxembourg',
4244 'MO': 'Macao',
4245 'MK': 'Macedonia, the Former Yugoslav Republic of',
4246 'MG': 'Madagascar',
4247 'MW': 'Malawi',
4248 'MY': 'Malaysia',
4249 'MV': 'Maldives',
4250 'ML': 'Mali',
4251 'MT': 'Malta',
4252 'MH': 'Marshall Islands',
4253 'MQ': 'Martinique',
4254 'MR': 'Mauritania',
4255 'MU': 'Mauritius',
4256 'YT': 'Mayotte',
4257 'MX': 'Mexico',
4258 'FM': 'Micronesia, Federated States of',
4259 'MD': 'Moldova, Republic of',
4260 'MC': 'Monaco',
4261 'MN': 'Mongolia',
4262 'ME': 'Montenegro',
4263 'MS': 'Montserrat',
4264 'MA': 'Morocco',
4265 'MZ': 'Mozambique',
4266 'MM': 'Myanmar',
4267 'NA': 'Namibia',
4268 'NR': 'Nauru',
4269 'NP': 'Nepal',
4270 'NL': 'Netherlands',
4271 'NC': 'New Caledonia',
4272 'NZ': 'New Zealand',
4273 'NI': 'Nicaragua',
4274 'NE': 'Niger',
4275 'NG': 'Nigeria',
4276 'NU': 'Niue',
4277 'NF': 'Norfolk Island',
4278 'MP': 'Northern Mariana Islands',
4279 'NO': 'Norway',
4280 'OM': 'Oman',
4281 'PK': 'Pakistan',
4282 'PW': 'Palau',
4283 'PS': 'Palestine, State of',
4284 'PA': 'Panama',
4285 'PG': 'Papua New Guinea',
4286 'PY': 'Paraguay',
4287 'PE': 'Peru',
4288 'PH': 'Philippines',
4289 'PN': 'Pitcairn',
4290 'PL': 'Poland',
4291 'PT': 'Portugal',
4292 'PR': 'Puerto Rico',
4293 'QA': 'Qatar',
4294 'RE': 'Réunion',
4295 'RO': 'Romania',
4296 'RU': 'Russian Federation',
4297 'RW': 'Rwanda',
4298 'BL': 'Saint Barthélemy',
4299 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4300 'KN': 'Saint Kitts and Nevis',
4301 'LC': 'Saint Lucia',
4302 'MF': 'Saint Martin (French part)',
4303 'PM': 'Saint Pierre and Miquelon',
4304 'VC': 'Saint Vincent and the Grenadines',
4305 'WS': 'Samoa',
4306 'SM': 'San Marino',
4307 'ST': 'Sao Tome and Principe',
4308 'SA': 'Saudi Arabia',
4309 'SN': 'Senegal',
4310 'RS': 'Serbia',
4311 'SC': 'Seychelles',
4312 'SL': 'Sierra Leone',
4313 'SG': 'Singapore',
4314 'SX': 'Sint Maarten (Dutch part)',
4315 'SK': 'Slovakia',
4316 'SI': 'Slovenia',
4317 'SB': 'Solomon Islands',
4318 'SO': 'Somalia',
4319 'ZA': 'South Africa',
4320 'GS': 'South Georgia and the South Sandwich Islands',
4321 'SS': 'South Sudan',
4322 'ES': 'Spain',
4323 'LK': 'Sri Lanka',
4324 'SD': 'Sudan',
4325 'SR': 'Suriname',
4326 'SJ': 'Svalbard and Jan Mayen',
4327 'SZ': 'Swaziland',
4328 'SE': 'Sweden',
4329 'CH': 'Switzerland',
4330 'SY': 'Syrian Arab Republic',
4331 'TW': 'Taiwan, Province of China',
4332 'TJ': 'Tajikistan',
4333 'TZ': 'Tanzania, United Republic of',
4334 'TH': 'Thailand',
4335 'TL': 'Timor-Leste',
4336 'TG': 'Togo',
4337 'TK': 'Tokelau',
4338 'TO': 'Tonga',
4339 'TT': 'Trinidad and Tobago',
4340 'TN': 'Tunisia',
4341 'TR': 'Turkey',
4342 'TM': 'Turkmenistan',
4343 'TC': 'Turks and Caicos Islands',
4344 'TV': 'Tuvalu',
4345 'UG': 'Uganda',
4346 'UA': 'Ukraine',
4347 'AE': 'United Arab Emirates',
4348 'GB': 'United Kingdom',
4349 'US': 'United States',
4350 'UM': 'United States Minor Outlying Islands',
4351 'UY': 'Uruguay',
4352 'UZ': 'Uzbekistan',
4353 'VU': 'Vanuatu',
4354 'VE': 'Venezuela, Bolivarian Republic of',
4355 'VN': 'Viet Nam',
4356 'VG': 'Virgin Islands, British',
4357 'VI': 'Virgin Islands, U.S.',
4358 'WF': 'Wallis and Futuna',
4359 'EH': 'Western Sahara',
4360 'YE': 'Yemen',
4361 'ZM': 'Zambia',
4362 'ZW': 'Zimbabwe',
4363 # Not ISO 3166 codes, but used for IP blocks
4364 'AP': 'Asia/Pacific Region',
4365 'EU': 'Europe',
4366 }
4367
4368 @classmethod
4369 def short2full(cls, code):
4370 """Convert an ISO 3166-2 country code to the corresponding full name"""
4371 return cls._country_map.get(code.upper())
4372
4373
4374 class GeoUtils:
4375 # Major IPv4 address blocks per country
4376 _country_ip_map = {
4377 'AD': '46.172.224.0/19',
4378 'AE': '94.200.0.0/13',
4379 'AF': '149.54.0.0/17',
4380 'AG': '209.59.64.0/18',
4381 'AI': '204.14.248.0/21',
4382 'AL': '46.99.0.0/16',
4383 'AM': '46.70.0.0/15',
4384 'AO': '105.168.0.0/13',
4385 'AP': '182.50.184.0/21',
4386 'AQ': '23.154.160.0/24',
4387 'AR': '181.0.0.0/12',
4388 'AS': '202.70.112.0/20',
4389 'AT': '77.116.0.0/14',
4390 'AU': '1.128.0.0/11',
4391 'AW': '181.41.0.0/18',
4392 'AX': '185.217.4.0/22',
4393 'AZ': '5.197.0.0/16',
4394 'BA': '31.176.128.0/17',
4395 'BB': '65.48.128.0/17',
4396 'BD': '114.130.0.0/16',
4397 'BE': '57.0.0.0/8',
4398 'BF': '102.178.0.0/15',
4399 'BG': '95.42.0.0/15',
4400 'BH': '37.131.0.0/17',
4401 'BI': '154.117.192.0/18',
4402 'BJ': '137.255.0.0/16',
4403 'BL': '185.212.72.0/23',
4404 'BM': '196.12.64.0/18',
4405 'BN': '156.31.0.0/16',
4406 'BO': '161.56.0.0/16',
4407 'BQ': '161.0.80.0/20',
4408 'BR': '191.128.0.0/12',
4409 'BS': '24.51.64.0/18',
4410 'BT': '119.2.96.0/19',
4411 'BW': '168.167.0.0/16',
4412 'BY': '178.120.0.0/13',
4413 'BZ': '179.42.192.0/18',
4414 'CA': '99.224.0.0/11',
4415 'CD': '41.243.0.0/16',
4416 'CF': '197.242.176.0/21',
4417 'CG': '160.113.0.0/16',
4418 'CH': '85.0.0.0/13',
4419 'CI': '102.136.0.0/14',
4420 'CK': '202.65.32.0/19',
4421 'CL': '152.172.0.0/14',
4422 'CM': '102.244.0.0/14',
4423 'CN': '36.128.0.0/10',
4424 'CO': '181.240.0.0/12',
4425 'CR': '201.192.0.0/12',
4426 'CU': '152.206.0.0/15',
4427 'CV': '165.90.96.0/19',
4428 'CW': '190.88.128.0/17',
4429 'CY': '31.153.0.0/16',
4430 'CZ': '88.100.0.0/14',
4431 'DE': '53.0.0.0/8',
4432 'DJ': '197.241.0.0/17',
4433 'DK': '87.48.0.0/12',
4434 'DM': '192.243.48.0/20',
4435 'DO': '152.166.0.0/15',
4436 'DZ': '41.96.0.0/12',
4437 'EC': '186.68.0.0/15',
4438 'EE': '90.190.0.0/15',
4439 'EG': '156.160.0.0/11',
4440 'ER': '196.200.96.0/20',
4441 'ES': '88.0.0.0/11',
4442 'ET': '196.188.0.0/14',
4443 'EU': '2.16.0.0/13',
4444 'FI': '91.152.0.0/13',
4445 'FJ': '144.120.0.0/16',
4446 'FK': '80.73.208.0/21',
4447 'FM': '119.252.112.0/20',
4448 'FO': '88.85.32.0/19',
4449 'FR': '90.0.0.0/9',
4450 'GA': '41.158.0.0/15',
4451 'GB': '25.0.0.0/8',
4452 'GD': '74.122.88.0/21',
4453 'GE': '31.146.0.0/16',
4454 'GF': '161.22.64.0/18',
4455 'GG': '62.68.160.0/19',
4456 'GH': '154.160.0.0/12',
4457 'GI': '95.164.0.0/16',
4458 'GL': '88.83.0.0/19',
4459 'GM': '160.182.0.0/15',
4460 'GN': '197.149.192.0/18',
4461 'GP': '104.250.0.0/19',
4462 'GQ': '105.235.224.0/20',
4463 'GR': '94.64.0.0/13',
4464 'GT': '168.234.0.0/16',
4465 'GU': '168.123.0.0/16',
4466 'GW': '197.214.80.0/20',
4467 'GY': '181.41.64.0/18',
4468 'HK': '113.252.0.0/14',
4469 'HN': '181.210.0.0/16',
4470 'HR': '93.136.0.0/13',
4471 'HT': '148.102.128.0/17',
4472 'HU': '84.0.0.0/14',
4473 'ID': '39.192.0.0/10',
4474 'IE': '87.32.0.0/12',
4475 'IL': '79.176.0.0/13',
4476 'IM': '5.62.80.0/20',
4477 'IN': '117.192.0.0/10',
4478 'IO': '203.83.48.0/21',
4479 'IQ': '37.236.0.0/14',
4480 'IR': '2.176.0.0/12',
4481 'IS': '82.221.0.0/16',
4482 'IT': '79.0.0.0/10',
4483 'JE': '87.244.64.0/18',
4484 'JM': '72.27.0.0/17',
4485 'JO': '176.29.0.0/16',
4486 'JP': '133.0.0.0/8',
4487 'KE': '105.48.0.0/12',
4488 'KG': '158.181.128.0/17',
4489 'KH': '36.37.128.0/17',
4490 'KI': '103.25.140.0/22',
4491 'KM': '197.255.224.0/20',
4492 'KN': '198.167.192.0/19',
4493 'KP': '175.45.176.0/22',
4494 'KR': '175.192.0.0/10',
4495 'KW': '37.36.0.0/14',
4496 'KY': '64.96.0.0/15',
4497 'KZ': '2.72.0.0/13',
4498 'LA': '115.84.64.0/18',
4499 'LB': '178.135.0.0/16',
4500 'LC': '24.92.144.0/20',
4501 'LI': '82.117.0.0/19',
4502 'LK': '112.134.0.0/15',
4503 'LR': '102.183.0.0/16',
4504 'LS': '129.232.0.0/17',
4505 'LT': '78.56.0.0/13',
4506 'LU': '188.42.0.0/16',
4507 'LV': '46.109.0.0/16',
4508 'LY': '41.252.0.0/14',
4509 'MA': '105.128.0.0/11',
4510 'MC': '88.209.64.0/18',
4511 'MD': '37.246.0.0/16',
4512 'ME': '178.175.0.0/17',
4513 'MF': '74.112.232.0/21',
4514 'MG': '154.126.0.0/17',
4515 'MH': '117.103.88.0/21',
4516 'MK': '77.28.0.0/15',
4517 'ML': '154.118.128.0/18',
4518 'MM': '37.111.0.0/17',
4519 'MN': '49.0.128.0/17',
4520 'MO': '60.246.0.0/16',
4521 'MP': '202.88.64.0/20',
4522 'MQ': '109.203.224.0/19',
4523 'MR': '41.188.64.0/18',
4524 'MS': '208.90.112.0/22',
4525 'MT': '46.11.0.0/16',
4526 'MU': '105.16.0.0/12',
4527 'MV': '27.114.128.0/18',
4528 'MW': '102.70.0.0/15',
4529 'MX': '187.192.0.0/11',
4530 'MY': '175.136.0.0/13',
4531 'MZ': '197.218.0.0/15',
4532 'NA': '41.182.0.0/16',
4533 'NC': '101.101.0.0/18',
4534 'NE': '197.214.0.0/18',
4535 'NF': '203.17.240.0/22',
4536 'NG': '105.112.0.0/12',
4537 'NI': '186.76.0.0/15',
4538 'NL': '145.96.0.0/11',
4539 'NO': '84.208.0.0/13',
4540 'NP': '36.252.0.0/15',
4541 'NR': '203.98.224.0/19',
4542 'NU': '49.156.48.0/22',
4543 'NZ': '49.224.0.0/14',
4544 'OM': '5.36.0.0/15',
4545 'PA': '186.72.0.0/15',
4546 'PE': '186.160.0.0/14',
4547 'PF': '123.50.64.0/18',
4548 'PG': '124.240.192.0/19',
4549 'PH': '49.144.0.0/13',
4550 'PK': '39.32.0.0/11',
4551 'PL': '83.0.0.0/11',
4552 'PM': '70.36.0.0/20',
4553 'PR': '66.50.0.0/16',
4554 'PS': '188.161.0.0/16',
4555 'PT': '85.240.0.0/13',
4556 'PW': '202.124.224.0/20',
4557 'PY': '181.120.0.0/14',
4558 'QA': '37.210.0.0/15',
4559 'RE': '102.35.0.0/16',
4560 'RO': '79.112.0.0/13',
4561 'RS': '93.86.0.0/15',
4562 'RU': '5.136.0.0/13',
4563 'RW': '41.186.0.0/16',
4564 'SA': '188.48.0.0/13',
4565 'SB': '202.1.160.0/19',
4566 'SC': '154.192.0.0/11',
4567 'SD': '102.120.0.0/13',
4568 'SE': '78.64.0.0/12',
4569 'SG': '8.128.0.0/10',
4570 'SI': '188.196.0.0/14',
4571 'SK': '78.98.0.0/15',
4572 'SL': '102.143.0.0/17',
4573 'SM': '89.186.32.0/19',
4574 'SN': '41.82.0.0/15',
4575 'SO': '154.115.192.0/18',
4576 'SR': '186.179.128.0/17',
4577 'SS': '105.235.208.0/21',
4578 'ST': '197.159.160.0/19',
4579 'SV': '168.243.0.0/16',
4580 'SX': '190.102.0.0/20',
4581 'SY': '5.0.0.0/16',
4582 'SZ': '41.84.224.0/19',
4583 'TC': '65.255.48.0/20',
4584 'TD': '154.68.128.0/19',
4585 'TG': '196.168.0.0/14',
4586 'TH': '171.96.0.0/13',
4587 'TJ': '85.9.128.0/18',
4588 'TK': '27.96.24.0/21',
4589 'TL': '180.189.160.0/20',
4590 'TM': '95.85.96.0/19',
4591 'TN': '197.0.0.0/11',
4592 'TO': '175.176.144.0/21',
4593 'TR': '78.160.0.0/11',
4594 'TT': '186.44.0.0/15',
4595 'TV': '202.2.96.0/19',
4596 'TW': '120.96.0.0/11',
4597 'TZ': '156.156.0.0/14',
4598 'UA': '37.52.0.0/14',
4599 'UG': '102.80.0.0/13',
4600 'US': '6.0.0.0/8',
4601 'UY': '167.56.0.0/13',
4602 'UZ': '84.54.64.0/18',
4603 'VA': '212.77.0.0/19',
4604 'VC': '207.191.240.0/21',
4605 'VE': '186.88.0.0/13',
4606 'VG': '66.81.192.0/20',
4607 'VI': '146.226.0.0/16',
4608 'VN': '14.160.0.0/11',
4609 'VU': '202.80.32.0/20',
4610 'WF': '117.20.32.0/21',
4611 'WS': '202.4.32.0/19',
4612 'YE': '134.35.0.0/16',
4613 'YT': '41.242.116.0/22',
4614 'ZA': '41.0.0.0/11',
4615 'ZM': '102.144.0.0/13',
4616 'ZW': '102.177.192.0/18',
4617 }
4618
4619 @classmethod
4620 def random_ipv4(cls, code_or_block):
4621 if len(code_or_block) == 2:
4622 block = cls._country_ip_map.get(code_or_block.upper())
4623 if not block:
4624 return None
4625 else:
4626 block = code_or_block
4627 addr, preflen = block.split('/')
4628 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4629 addr_max = addr_min | (0xffffffff >> int(preflen))
4630 return str(socket.inet_ntoa(
4631 struct.pack('!L', random.randint(addr_min, addr_max))))
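
# Illustrative example (editor's sketch): 'DE' maps to 53.0.0.0/8, so addr_min
# is 53.0.0.0 and addr_max is 53.255.255.255; the result is non-deterministic:
# >>> GeoUtils.random_ipv4('DE')  # e.g. '53.17.201.4'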
4632
4633
4634 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4635 def __init__(self, proxies=None):
4636 # Set default handlers
4637 for type in ('http', 'https'):
4638 setattr(self, '%s_open' % type,
4639 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4640 meth(r, proxy, type))
4641 urllib.request.ProxyHandler.__init__(self, proxies)
4642
4643 def proxy_open(self, req, proxy, type):
4644 req_proxy = req.headers.get('Ytdl-request-proxy')
4645 if req_proxy is not None:
4646 proxy = req_proxy
4647 del req.headers['Ytdl-request-proxy']
4648
4649 if proxy == '__noproxy__':
4650 return None # No Proxy
4651 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4652 req.add_header('Ytdl-socks-proxy', proxy)
4653 # yt-dlp's http/https handlers wrap the socket with socks themselves
4654 return None
4655 return urllib.request.ProxyHandler.proxy_open(
4656 self, req, proxy, type)
4657
4658
4659 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4660 # released into Public Domain
4661 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4662
4663 def long_to_bytes(n, blocksize=0):
4664 """long_to_bytes(n:long, blocksize:int) : string
4665 Convert a long integer to a byte string.
4666
4667 If optional blocksize is given and greater than zero, pad the front of the
4668 byte string with binary zeros so that the length is a multiple of
4669 blocksize.
4670 """
4671 # after much testing, this algorithm was deemed to be the fastest
4672 s = b''
4673 n = int(n)
4674 while n > 0:
4675 s = struct.pack('>I', n & 0xffffffff) + s
4676 n = n >> 32
4677 # strip off leading zeros
4678 for i in range(len(s)):
4679 if s[i] != b'\000'[0]:
4680 break
4681 else:
4682 # only happens when n == 0
4683 s = b'\000'
4684 i = 0
4685 s = s[i:]
4686 # add back some pad bytes. this could be done more efficiently w.r.t. the
4687 # de-padding being done above, but sigh...
4688 if blocksize > 0 and len(s) % blocksize:
4689 s = (blocksize - len(s) % blocksize) * b'\000' + s
4690 return s
4691
4692
4693 def bytes_to_long(s):
4694 """bytes_to_long(string) : long
4695 Convert a byte string to a long integer.
4696
4697 This is (essentially) the inverse of long_to_bytes().
4698 """
4699 acc = 0
4700 length = len(s)
4701 if length % 4:
4702 extra = (4 - length % 4)
4703 s = b'\000' * extra + s
4704 length = length + extra
4705 for i in range(0, length, 4):
4706 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4707 return acc
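
# Illustrative round-trip (editor's sketch):
# >>> long_to_bytes(0x12345678, blocksize=8)
# b'\x00\x00\x00\x00\x124Vx'
# >>> bytes_to_long(b'\x124Vx')
# 305419896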
4708
4709
4710 def ohdave_rsa_encrypt(data, exponent, modulus):
4711 '''
4712 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4713
4714 Input:
4715 data: data to encrypt, bytes-like object
4716 exponent, modulus: parameter e and N of RSA algorithm, both integer
4717 Output: hex string of encrypted data
4718
4719 Limitation: supports one block encryption only
4720 '''
4721
4722 payload = int(binascii.hexlify(data[::-1]), 16)
4723 encrypted = pow(payload, exponent, modulus)
4724 return '%x' % encrypted
4725
4726
4727 def pkcs1pad(data, length):
4728 """
4729 Padding input data with PKCS#1 scheme
4730
4731 @param {int[]} data input data
4732 @param {int} length target length
4733 @returns {int[]} padded data
4734 """
4735 if len(data) > length - 11:
4736 raise ValueError('Input data too long for PKCS#1 padding')
4737
4738 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4739 return [0, 2] + pseudo_random + [0] + data
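
# Illustrative layout (editor's sketch): pkcs1pad([0x41], 12) returns
# [0, 2, r1, ..., r8, 0, 0x41] where r1..r8 are random bytes in [0, 254].
# (Strict PKCS#1 v1.5 requires the padding bytes to be non-zero, which
# random.randint(0, 254) above does not guarantee.)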
4740
4741
4742 def _base_n_table(n, table):
4743 if not table and not n:
4744 raise ValueError('Either table or n must be specified')
4745 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4746
4747 if n and n != len(table):
4748 raise ValueError(f'base {n} exceeds table length {len(table)}')
4749 return table
4750
4751
4752 def encode_base_n(num, n=None, table=None):
4753 """Convert given int to a base-n string"""
4754 table = _base_n_table(n, table)
4755 if not num:
4756 return table[0]
4757
4758 result, base = '', len(table)
4759 while num:
4760 result = table[num % base] + result
4761 num = num // base
4762 return result
4763
4764
4765 def decode_base_n(string, n=None, table=None):
4766 """Convert given base-n string to int"""
4767 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4768 result, base = 0, len(table)
4769 for char in string:
4770 result = result * base + table[char]
4771 return result
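
# Illustrative round-trip (editor's sketch):
# >>> encode_base_n(123456, n=36)
# '2n9c'
# >>> decode_base_n('2n9c', n=36)
# 123456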
4772
4773
4774 def decode_base(value, digits):
4775 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4776 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
4777 return decode_base_n(value, table=digits)
4778
4779
4780 def decode_packed_codes(code):
4781 mobj = re.search(PACKED_CODES_RE, code)
4782 obfuscated_code, base, count, symbols = mobj.groups()
4783 base = int(base)
4784 count = int(count)
4785 symbols = symbols.split('|')
4786 symbol_table = {}
4787
4788 while count:
4789 count -= 1
4790 base_n_count = encode_base_n(count, base)
4791 symbol_table[base_n_count] = symbols[count] or base_n_count
4792
4793 return re.sub(
4794 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4795 obfuscated_code)
4796
4797
4798 def caesar(s, alphabet, shift):
4799 if shift == 0:
4800 return s
4801 l = len(alphabet)
4802 return ''.join(
4803 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4804 for c in s)
4805
4806
4807 def rot47(s):
4808 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
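
# Illustrative examples (editor's sketch):
# >>> caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 2)
# 'cde'
# >>> rot47('foo')
# '7@@'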
4809
4810
4811 def parse_m3u8_attributes(attrib):
4812 info = {}
4813 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4814 if val.startswith('"'):
4815 val = val[1:-1]
4816 info[key] = val
4817 return info
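
# Illustrative example (editor's sketch): quoted values may contain commas
# >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
# {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}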
4818
4819
4820 def urshift(val, n):
4821 return val >> n if val >= 0 else (val + 0x100000000) >> n
4822
4823
4824 # Based on png2str() written by @gdkchan and improved by @yokrysty
4825 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4826 def decode_png(png_data):
4827 # Reference: https://www.w3.org/TR/PNG/
4828 header = png_data[8:]
4829
4830 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4831 raise OSError('Not a valid PNG file.')
4832
4833 int_map = {1: '>B', 2: '>H', 4: '>I'}
4834 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4835
4836 chunks = []
4837
4838 while header:
4839 length = unpack_integer(header[:4])
4840 header = header[4:]
4841
4842 chunk_type = header[:4]
4843 header = header[4:]
4844
4845 chunk_data = header[:length]
4846 header = header[length:]
4847
4848 header = header[4:] # Skip CRC
4849
4850 chunks.append({
4851 'type': chunk_type,
4852 'length': length,
4853 'data': chunk_data
4854 })
4855
4856 ihdr = chunks[0]['data']
4857
4858 width = unpack_integer(ihdr[:4])
4859 height = unpack_integer(ihdr[4:8])
4860
4861 idat = b''
4862
4863 for chunk in chunks:
4864 if chunk['type'] == b'IDAT':
4865 idat += chunk['data']
4866
4867 if not idat:
4868 raise OSError('Unable to read PNG data.')
4869
4870 decompressed_data = bytearray(zlib.decompress(idat))
4871
4872 stride = width * 3
4873 pixels = []
4874
4875 def _get_pixel(idx):
4876 x = idx % stride
4877 y = idx // stride
4878 return pixels[y][x]
4879
4880 for y in range(height):
4881 basePos = y * (1 + stride)
4882 filter_type = decompressed_data[basePos]
4883
4884 current_row = []
4885
4886 pixels.append(current_row)
4887
4888 for x in range(stride):
4889 color = decompressed_data[1 + basePos + x]
4890 basex = y * stride + x
4891 left = 0
4892 up = 0
4893
4894 if x > 2:
4895 left = _get_pixel(basex - 3)
4896 if y > 0:
4897 up = _get_pixel(basex - stride)
4898
4899 if filter_type == 1: # Sub
4900 color = (color + left) & 0xff
4901 elif filter_type == 2: # Up
4902 color = (color + up) & 0xff
4903 elif filter_type == 3: # Average
4904 color = (color + ((left + up) >> 1)) & 0xff
4905 elif filter_type == 4: # Paeth
4906 a = left
4907 b = up
4908 c = 0
4909
4910 if x > 2 and y > 0:
4911 c = _get_pixel(basex - stride - 3)
4912
4913 p = a + b - c
4914
4915 pa = abs(p - a)
4916 pb = abs(p - b)
4917 pc = abs(p - c)
4918
4919 if pa <= pb and pa <= pc:
4920 color = (color + a) & 0xff
4921 elif pb <= pc:
4922 color = (color + b) & 0xff
4923 else:
4924 color = (color + c) & 0xff
4925
4926 current_row.append(color)
4927
4928 return width, height, pixels
4929
4930
4931 def write_xattr(path, key, value):
4932 # Windows: Write xattrs to NTFS Alternate Data Streams:
4933 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4934 if compat_os_name == 'nt':
4935 assert ':' not in key
4936 assert os.path.exists(path)
4937
4938 try:
4939 with open(f'{path}:{key}', 'wb') as f:
4940 f.write(value)
4941 except OSError as e:
4942 raise XAttrMetadataError(e.errno, e.strerror)
4943 return
4944
4945 # UNIX Method 1. Use xattrs/pyxattrs modules
4946
4947 setxattr = None
4948 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4949 # Unicode arguments are not supported in pyxattr until version 0.5.0
4950 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4951 if version_tuple(xattr.__version__) >= (0, 5, 0):
4952 setxattr = xattr.set
4953 elif xattr:
4954 setxattr = xattr.setxattr
4955
4956 if setxattr:
4957 try:
4958 setxattr(path, key, value)
4959 except OSError as e:
4960 raise XAttrMetadataError(e.errno, e.strerror)
4961 return
4962
4963 # UNIX Method 2. Use setfattr/xattr executables
4964 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4965 else 'xattr' if check_executable('xattr', ['-h']) else None)
4966 if not exe:
4967 raise XAttrUnavailableError(
4968 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4969 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4970
4971 value = value.decode()
4972 try:
4973 _, stderr, returncode = Popen.run(
4974 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4975 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4976 except OSError as e:
4977 raise XAttrMetadataError(e.errno, e.strerror)
4978 if returncode:
4979 raise XAttrMetadataError(returncode, stderr)
4980
4981
4982 def random_birthday(year_field, month_field, day_field):
4983 start_date = datetime.date(1950, 1, 1)
4984 end_date = datetime.date(1995, 12, 31)
4985 offset = random.randint(0, (end_date - start_date).days)
4986 random_date = start_date + datetime.timedelta(offset)
4987 return {
4988 year_field: str(random_date.year),
4989 month_field: str(random_date.month),
4990 day_field: str(random_date.day),
4991 }
4992
4993
4994 # Templates for internet shortcut files, which are plain text files.
4995 DOT_URL_LINK_TEMPLATE = '''\
4996 [InternetShortcut]
4997 URL=%(url)s
4998 '''
4999
5000 DOT_WEBLOC_LINK_TEMPLATE = '''\
5001 <?xml version="1.0" encoding="UTF-8"?>
5002 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5003 <plist version="1.0">
5004 <dict>
5005 \t<key>URL</key>
5006 \t<string>%(url)s</string>
5007 </dict>
5008 </plist>
5009 '''
5010
5011 DOT_DESKTOP_LINK_TEMPLATE = '''\
5012 [Desktop Entry]
5013 Encoding=UTF-8
5014 Name=%(filename)s
5015 Type=Link
5016 URL=%(url)s
5017 Icon=text-html
5018 '''
5019
5020 LINK_TEMPLATES = {
5021 'url': DOT_URL_LINK_TEMPLATE,
5022 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5023 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5024 }
5025
5026
5027 def iri_to_uri(iri):
5028 """
5029 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5030
5031 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes the remaining unsafe characters using their underlying UTF-8 encoding, leaving already-escaped sequences intact.
5032 """
5033
5034 iri_parts = urllib.parse.urlparse(iri)
5035
5036 if '[' in iri_parts.netloc:
5037 raise ValueError('IPv6 URIs are not yet supported.')
5038 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5039
5040 # The `safe` argument values used by the following code contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5041
5042 net_location = ''
5043 if iri_parts.username:
5044 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5045 if iri_parts.password is not None:
5046 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5047 net_location += '@'
5048
5049 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5050 # The 'idna' encoding produces ASCII text.
5051 if iri_parts.port is not None and iri_parts.port != 80:
5052 net_location += ':' + str(iri_parts.port)
5053
5054 return urllib.parse.urlunparse(
5055 (iri_parts.scheme,
5056 net_location,
5057
5058 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5059
5060 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5061 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5062
5063 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5064 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5065
5066 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5067
5068 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
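
# Illustrative example (editor's sketch):
# >>> iri_to_uri('http://example.com/résumé?q=café')
# 'http://example.com/r%C3%A9sum%C3%A9?q=caf%C3%A9'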
5069
5070
5071 def to_high_limit_path(path):
5072 if sys.platform in ['win32', 'cygwin']:
5073 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5074 return '\\\\?\\' + os.path.abspath(path)
5075
5076 return path
5077
5078
5079 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5080 val = traverse_obj(obj, *variadic(field))
5081 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5082 return default
5083 return template % func(val)
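
# Illustrative examples (editor's sketch):
# >>> format_field({'height': 1080}, 'height', '%sp')
# '1080p'
# >>> format_field({'height': None}, 'height', '%sp', default='unknown')
# 'unknown'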
5084
5085
5086 def clean_podcast_url(url):
5087 return re.sub(r'''(?x)
5088 (?:
5089 (?:
5090 chtbl\.com/track|
5091 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5092 play\.podtrac\.com
5093 )/[^/]+|
5094 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5095 flex\.acast\.com|
5096 pd(?:
5097 cn\.co| # https://podcorn.com/analytics-prefix/
5098 st\.fm # https://podsights.com/docs/
5099 )/e
5100 )/''', '', url)
5101
5102
5103 _HEX_TABLE = '0123456789abcdef'
5104
5105
5106 def random_uuidv4():
5107 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5108
5109
5110 def make_dir(path, to_screen=None):
5111 try:
5112 dn = os.path.dirname(path)
5113 if dn and not os.path.exists(dn):
5114 os.makedirs(dn)
5115 return True
5116 except OSError as err:
5117 if callable(to_screen):
5118 to_screen('unable to create directory ' + error_to_compat_str(err))
5119 return False
5120
5121
5122 def get_executable_path():
5123 from .update import _get_variant_and_executable_path
5124
5125 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5126
5127
5128 def load_plugins(name, suffix, namespace):
5129 classes = {}
5130 with contextlib.suppress(FileNotFoundError):
5131 plugins_spec = importlib.util.spec_from_file_location(
5132 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5133 plugins = importlib.util.module_from_spec(plugins_spec)
5134 sys.modules[plugins_spec.name] = plugins
5135 plugins_spec.loader.exec_module(plugins)
5136 for name in dir(plugins):
5137 if name in namespace:
5138 continue
5139 if not name.endswith(suffix):
5140 continue
5141 klass = getattr(plugins, name)
5142 classes[name] = namespace[name] = klass
5143 return classes
5144
5145
5146 def traverse_obj(
5147 obj, *path_list, default=None, expected_type=None, get_all=True,
5148 casesense=True, is_user_input=False, traverse_string=False):
5149 ''' Traverse nested list/dict/tuple
5150 @param path_list A list of paths which are checked one by one.
5151 Each path is a list of keys where each key is a:
5152 - None: Do nothing
5153 - string: A dictionary key
5154 - int: An index into a list
5155 - tuple: A list of keys all of which will be traversed
5156 - Ellipsis: Fetch all values in the object
5157 - Function: Takes the key and value as arguments
5158 and returns whether the key matches or not
5159 @param default Default value to return
5160 @param expected_type Only accept final value of this type (Can also be any callable)
5161 @param get_all Return all the values obtained from a path or only the first one
5162 @param casesense Whether to consider dictionary keys as case sensitive
5163 @param is_user_input Whether the keys are generated from user input. If True,
5164 strings are converted to int/slice if necessary
5165 @param traverse_string Whether to traverse inside strings. If True, any
5166 non-compatible object will also be converted into a string
5167 # TODO: Write tests
5168 '''
5169 if not casesense:
5170 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5171 path_list = (map(_lower, variadic(path)) for path in path_list)
5172
5173 def _traverse_obj(obj, path, _current_depth=0):
5174 nonlocal depth
5175 path = tuple(variadic(path))
5176 for i, key in enumerate(path):
5177 if None in (key, obj):
5178 return obj
5179 if isinstance(key, (list, tuple)):
5180 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5181 key = ...
5182 if key is ...:
5183 obj = (obj.values() if isinstance(obj, dict)
5184 else obj if isinstance(obj, (list, tuple, LazyList))
5185 else str(obj) if traverse_string else [])
5186 _current_depth += 1
5187 depth = max(depth, _current_depth)
5188 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5189 elif callable(key):
5190 if isinstance(obj, (list, tuple, LazyList)):
5191 obj = enumerate(obj)
5192 elif isinstance(obj, dict):
5193 obj = obj.items()
5194 else:
5195 if not traverse_string:
5196 return None
5197 obj = str(obj)
5198 _current_depth += 1
5199 depth = max(depth, _current_depth)
5200 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5201 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5202 obj = (obj.get(key) if casesense or (key in obj)
5203 else next((v for k, v in obj.items() if _lower(k) == key), None))
5204 else:
5205 if is_user_input:
5206 key = (int_or_none(key) if ':' not in key
5207 else slice(*map(int_or_none, key.split(':'))))
5208 if key == slice(None):
5209 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5210 if not isinstance(key, (int, slice)):
5211 return None
5212 if not isinstance(obj, (list, tuple, LazyList)):
5213 if not traverse_string:
5214 return None
5215 obj = str(obj)
5216 try:
5217 obj = obj[key]
5218 except IndexError:
5219 return None
5220 return obj
5221
5222 if isinstance(expected_type, type):
5223 type_test = lambda val: val if isinstance(val, expected_type) else None
5224 else:
5225 type_test = expected_type or IDENTITY
5226
5227 for path in path_list:
5228 depth = 0
5229 val = _traverse_obj(obj, path)
5230 if val is not None:
5231 if depth:
5232 for _ in range(depth - 1):
5233 val = itertools.chain.from_iterable(v for v in val if v is not None)
5234 val = [v for v in map(type_test, val) if v is not None]
5235 if val:
5236 return val if get_all else val[0]
5237 else:
5238 val = type_test(val)
5239 if val is not None:
5240 return val
5241 return default
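
# Illustrative examples (editor's sketch): `...` fetches all values at a level
# >>> traverse_obj({'formats': [{'url': 'a'}, {'url': 'b'}]}, ('formats', ..., 'url'))
# ['a', 'b']
# >>> traverse_obj({'formats': [{'url': 'a'}]}, ('formats', 0, 'height'), default=0)
# 0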
5242
5243
5244 def traverse_dict(dictn, keys, casesense=True):
5245 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5246 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5247 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5248
5249
5250 def get_first(obj, keys, **kwargs):
5251 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5252
5253
5254 def variadic(x, allowed_types=(str, bytes, dict)):
5255 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5256
5257
5258 def time_seconds(**kwargs):
5259 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5260 return t.timestamp()
5261
5262
5263 # create a JSON Web Signature (jws) with HS256 algorithm
5264 # the resulting format is in JWS Compact Serialization
5265 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5266 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5267 def jwt_encode_hs256(payload_data, key, headers={}):
5268 header_data = {
5269 'alg': 'HS256',
5270 'typ': 'JWT',
5271 }
5272 if headers:
5273 header_data.update(headers)
5274 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5275 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5276 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5277 signature_b64 = base64.b64encode(h.digest())
5278 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5279 return token
5280
5281
5282 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5283 def jwt_decode_hs256(jwt):
5284 header_b64, payload_b64, signature_b64 = jwt.split('.')
5285 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5286 return payload_data
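
# Illustrative round-trip (editor's sketch; jwt_encode_hs256 returns bytes):
# >>> token = jwt_encode_hs256({'uid': 123}, 'secret')
# >>> jwt_decode_hs256(token.decode())['uid']
# 123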
5287
5288
5289 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5290
5291
5292 @functools.cache
5293 def supports_terminal_sequences(stream):
5294 if compat_os_name == 'nt':
5295 if not WINDOWS_VT_MODE:
5296 return False
5297 elif not os.getenv('TERM'):
5298 return False
5299 try:
5300 return stream.isatty()
5301 except BaseException:
5302 return False
5303
5304
5305 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5306 if get_windows_version() < (10, 0, 10586):
5307 return
5308 global WINDOWS_VT_MODE
5309 try:
5310 Popen.run('', shell=True)
5311 except Exception:
5312 return
5313
5314 WINDOWS_VT_MODE = True
5315 supports_terminal_sequences.cache_clear()
5316
5317
5318 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5319
5320
5321 def remove_terminal_sequences(string):
5322 return _terminal_sequences_re.sub('', string)
5323
5324
5325 def number_of_digits(number):
5326 return len('%d' % number)
5327
5328
5329 def join_nonempty(*values, delim='-', from_dict=None):
5330 if from_dict is not None:
5331 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5332 return delim.join(map(str, filter(None, values)))
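
# Illustrative example (editor's sketch): falsy values are dropped
# >>> join_nonempty('mp4', None, 1080, '', delim='-')
# 'mp4-1080'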
5333
5334
5335 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5336 """
5337 Find the largest format dimensions in terms of video width and, for each thumbnail:
5338 * Modify the URL: Match the width with the provided regex and replace with the former width
5339 * Update dimensions
5340
5341 This function is useful with video services that scale the provided thumbnails on demand
5342 """
5343 _keys = ('width', 'height')
5344 max_dimensions = max(
5345 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5346 default=(0, 0))
5347 if not max_dimensions[0]:
5348 return thumbnails
5349 return [
5350 merge_dicts(
5351 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5352 dict(zip(_keys, max_dimensions)), thumbnail)
5353 for thumbnail in thumbnails
5354 ]
5355
5356
5357 def parse_http_range(range):
5358 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5359 if not range:
5360 return None, None, None
5361 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5362 if not crg:
5363 return None, None, None
5364 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
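
# Illustrative examples (editor's sketch):
# >>> parse_http_range('bytes 0-499/1234')
# (0, 499, 1234)
# >>> parse_http_range('bytes=500-')
# (500, None, None)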
5365
5366
5367 def read_stdin(what):
5368 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5369 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5370 return sys.stdin
5371
5372
5373 class Config:
5374 own_args = None
5375 parsed_args = None
5376 filename = None
5377 __initialized = False
5378
5379 def __init__(self, parser, label=None):
5380 self.parser, self.label = parser, label
5381 self._loaded_paths, self.configs = set(), []
5382
5383 def init(self, args=None, filename=None):
5384 assert not self.__initialized
5385 directory = ''
5386 if filename:
5387 location = os.path.realpath(filename)
5388 directory = os.path.dirname(location)
5389 if location in self._loaded_paths:
5390 return False
5391 self._loaded_paths.add(location)
5392
5393 self.own_args, self.__initialized = args, True
5394 opts, _ = self.parser.parse_known_args(args)
5395 self.parsed_args, self.filename = args, filename
5396
5397 for location in opts.config_locations or []:
5398 if location == '-':
5399 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5400 continue
5401 location = os.path.join(directory, expand_path(location))
5402 if os.path.isdir(location):
5403 location = os.path.join(location, 'yt-dlp.conf')
5404 if not os.path.exists(location):
5405 self.parser.error(f'config location {location} does not exist')
5406 self.append_config(self.read_file(location), location)
5407 return True
5408
5409 def __str__(self):
5410 label = join_nonempty(
5411 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5412 delim=' ')
5413 return join_nonempty(
5414 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5415 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5416 delim='\n')
5417
5418 @staticmethod
5419 def read_file(filename, default=[]):
5420 try:
5421 optionf = open(filename)
5422 except OSError:
5423 return default # silently skip if file is not present
5424 try:
5425 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5426 contents = optionf.read()
5427 res = shlex.split(contents, comments=True)
5428 except Exception as err:
5429 raise ValueError(f'Unable to parse "{filename}": {err}')
5430 finally:
5431 optionf.close()
5432 return res
5433
5434 @staticmethod
5435 def hide_login_info(opts):
5436 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5437 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5438
5439 def _scrub_eq(o):
5440 m = eqre.match(o)
5441 if m:
5442 return m.group('key') + '=PRIVATE'
5443 else:
5444 return o
5445
5446 opts = list(map(_scrub_eq, opts))
5447 for idx, opt in enumerate(opts):
5448 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5449 opts[idx + 1] = 'PRIVATE'
5450 return opts
5451
5452 def append_config(self, *args, label=None):
5453 config = type(self)(self.parser, label)
5454 config._loaded_paths = self._loaded_paths
5455 if config.init(*args):
5456 self.configs.append(config)
5457
5458 @property
5459 def all_args(self):
5460 for config in reversed(self.configs):
5461 yield from config.all_args
5462 yield from self.parsed_args or []
5463
5464 def parse_known_args(self, **kwargs):
5465 return self.parser.parse_known_args(self.all_args, **kwargs)
5466
5467 def parse_args(self):
5468 return self.parser.parse_args(self.all_args)
5469
5470
5471 class WebSocketsWrapper:
5472 """Wraps websockets module to use in non-async scopes"""
5473 pool = None
5474
5475 def __init__(self, url, headers=None, connect=True):
5476 self.loop = asyncio.new_event_loop()
5477 # XXX: "loop" is deprecated
5478 self.conn = websockets.connect(
5479 url, extra_headers=headers, ping_interval=None,
5480 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5481 if connect:
5482 self.__enter__()
5483 atexit.register(self.__exit__, None, None, None)
5484
5485 def __enter__(self):
5486 if not self.pool:
5487 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5488 return self
5489
5490 def send(self, *args):
5491 self.run_with_loop(self.pool.send(*args), self.loop)
5492
5493 def recv(self, *args):
5494 return self.run_with_loop(self.pool.recv(*args), self.loop)
5495
5496 def __exit__(self, type, value, traceback):
5497 try:
5498 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5499 finally:
5500 self._cancel_all_tasks(self.loop)  # cancel pending tasks before closing the loop
5501 self.loop.close()
5502
5503 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5504 # for contributors: If any other library that uses asyncio needs to be run in a non-async context, move these functions out of this class
5505 @staticmethod
5506 def run_with_loop(main, loop):
5507 if not asyncio.iscoroutine(main):
5508 raise ValueError(f'a coroutine was expected, got {main!r}')
5509
5510 try:
5511 return loop.run_until_complete(main)
5512 finally:
5513 loop.run_until_complete(loop.shutdown_asyncgens())
5514 if hasattr(loop, 'shutdown_default_executor'):
5515 loop.run_until_complete(loop.shutdown_default_executor())
5516
5517 @staticmethod
5518 def _cancel_all_tasks(loop):
5519 to_cancel = asyncio.all_tasks(loop)
5520
5521 if not to_cancel:
5522 return
5523
5524 for task in to_cancel:
5525 task.cancel()
5526
5527 # XXX: "loop" is removed in python 3.10+
5528 loop.run_until_complete(
5529 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5530
5531 for task in to_cancel:
5532 if task.cancelled():
5533 continue
5534 if task.exception() is not None:
5535 loop.call_exception_handler({
5536 'message': 'unhandled exception during asyncio.run() shutdown',
5537 'exception': task.exception(),
5538 'task': task,
5539 })
5540
5541
5542 def merge_headers(*dicts):
5543 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5544 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
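
# Illustrative example (editor's sketch): later dicts win, keys are title-cased
# >>> merge_headers({'user-agent': 'A'}, {'User-Agent': 'B', 'Accept': '*/*'})
# {'User-Agent': 'B', 'Accept': '*/*'}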
5545
5546
5547 class classproperty:
5548 """classmethod(property(func)) that works in py < 3.9"""
5549
5550 def __init__(self, func):
5551 functools.update_wrapper(self, func)
5552 self.func = func
5553
5554 def __get__(self, _, cls):
5555 return self.func(cls)
5556
5557
5558 class Namespace(types.SimpleNamespace):
5559 """Immutable namespace"""
5560
5561 def __iter__(self):
5562 return iter(self.__dict__.values())
5563
5564 @property
5565 def items_(self):
5566 return self.__dict__.items()
5567
5568
5569 # Deprecated
5570 has_certifi = bool(certifi)
5571 has_websockets = bool(websockets)