import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import asyncio, functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
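
# Illustrative example (added for this edit; not in the upstream module). The
# expanded form is the Clark notation that xml.etree.ElementTree expects:
#   >>> xpath_with_ns('ns:video/ns:title', {'ns': 'http://example.com/ns'})
#   '{http://example.com/ns}video/{http://example.com/ns}title'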


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None
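
# Illustrative example (added; the HTML snippet is made up):
#   >>> get_element_by_class('foo', '<span class="foo bar">text</span>')
#   'text'
#   >>> get_element_html_by_class('foo', '<span class="foo bar">text</span>')
#   '<span class="foo bar">text</span>'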


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
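
# Illustrative example (added): tags are stripped, <br> and </p><p> become newlines.
#   >>> clean_html('<p>Hello<br/>World</p>')
#   'Hello\nWorld'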


class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)
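
# Illustrative example (added): with ignore_extra=True, trailing garbage after
# the first JSON value is discarded instead of raising.
#   >>> LenientJSONDecoder(ignore_extra=True).decode('{"status": "ok"} trailing junk')
#   {'status': 'ok'}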


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
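
# Illustrative example (added):
#   >>> timeconvert('Wed, 14 Jun 2017 07:00:00 GMT')  # RFC 2822 date
#   1497423600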


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
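
# Illustrative examples (added):
#   >>> sanitize_filename('Foo: Bar')
#   'Foo - Bar'
#   >>> sanitize_filename('Foo: Bar', restricted=True)
#   'Foo_-_Bar'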


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
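
# Illustrative examples (added):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com/video')
#   'https://example.com/video'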


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
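
# Illustrative example (added): credentials are moved out of the netloc and
# returned as a ready-to-use Authorization header value.
#   >>> extract_basic_auth('http://user:pass@example.com/path')
#   ('http://example.com/path', 'Basic dXNlcjpwYXNz')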


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
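
# Illustrative example (added): order of first occurrence is preserved; the
# membership test is linear per item since `seen` is a list (to allow unhashables).
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]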


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
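
# Illustrative example (added): named, decimal and hex entities are all handled.
#   >>> unescapeHTML('Bert &amp; Ernie &#38; friends &#x26; co')
#   'Bert & Ernie & friends & co'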


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode
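
# Illustrative usage sketch (added; assumes the interpreter can spawn itself):
#   stdout, stderr, returncode = Popen.run(
#       [sys.executable, '--version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)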


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
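
# Illustrative examples (added):
#   >>> formatSeconds(3661)
#   '1:01:01'
#   >>> formatSeconds(61.5, msec=True)
#   '1:01.500'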


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to work around _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
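
# Illustrative example (added): the marker header suppresses compression and is
# itself removed before the request is sent.
#   >>> handle_youtubedl_headers({'Accept-Encoding': 'gzip', 'Youtubedl-no-compression': '1'})
#   {}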
1259
1260
1261 class YoutubeDLHandler(urllib.request.HTTPHandler):
1262 """Handler for HTTP requests and responses.
1263
1264 This class, when installed with an OpenerDirector, automatically adds
1265 the standard headers to every HTTP request and handles gzipped and
1266 deflated responses from web servers. If compression is to be avoided in
1267 a particular request, the original request in the program code only has
1268 to include the HTTP header "Youtubedl-no-compression", which will be
1269 removed before making the real request.
1270
1271 Part of this code was copied from:
1272
1273 http://techknack.net/python-urllib2-handlers/
1274
1275 Andrew Rowls, the author of that code, agreed to release it to the
1276 public domain.
1277 """
1278
1279 def __init__(self, params, *args, **kwargs):
1280 urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
1281 self._params = params
1282
1283 def http_open(self, req):
1284 conn_class = http.client.HTTPConnection
1285
1286 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1287 if socks_proxy:
1288 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1289 del req.headers['Ytdl-socks-proxy']
1290
1291 return self.do_open(functools.partial(
1292 _create_http_connection, self, conn_class, False),
1293 req)
1294
1295 @staticmethod
1296 def deflate(data):
1297 if not data:
1298 return data
1299 try:
1300 return zlib.decompress(data, -zlib.MAX_WBITS)
1301 except zlib.error:
1302 return zlib.decompress(data)
1303
1304 @staticmethod
1305 def brotli(data):
1306 if not data:
1307 return data
1308 return brotli.decompress(data)
1309
1310 def http_request(self, req):
1311 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1312 # always respected by websites, some tend to give out URLs with non percent-encoded
1313 # non-ASCII characters (see telemb.py, ard.py [#3412])
1314 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1315 # To work around aforementioned issue we will replace request's original URL with
1316 # percent-encoded one
1317 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1318 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1319 url = req.get_full_url()
1320 url_escaped = escape_url(url)
1321
1322 # Substitute URL if any change after escaping
1323 if url != url_escaped:
1324 req = update_Request(req, url=url_escaped)
1325
1326 for h, v in self._params.get('http_headers', std_headers).items():
1327 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1328 # The dict keys are capitalized because of this bug by urllib
1329 if h.capitalize() not in req.headers:
1330 req.add_header(h, v)
1331
1332 if 'Accept-encoding' not in req.headers:
1333 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1334
1335 req.headers = handle_youtubedl_headers(req.headers)
1336
1337 return super().do_request_(req)
1338
1339 def http_response(self, req, resp):
1340 old_resp = resp
1341 # gzip
1342 if resp.headers.get('Content-encoding', '') == 'gzip':
1343 content = resp.read()
1344 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1345 try:
1346 uncompressed = io.BytesIO(gz.read())
1347 except OSError as original_ioerror:
1348 # There may be junk add the end of the file
1349 # See http://stackoverflow.com/q/4928560/35070 for details
1350 for i in range(1, 1024):
1351 try:
1352 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1353 uncompressed = io.BytesIO(gz.read())
1354 except OSError:
1355 continue
1356 break
1357 else:
1358 raise original_ioerror
1359 resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1360 resp.msg = old_resp.msg
1361 del resp.headers['Content-encoding']
1362 # deflate
1363 if resp.headers.get('Content-encoding', '') == 'deflate':
1364 gz = io.BytesIO(self.deflate(resp.read()))
1365 resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1366 resp.msg = old_resp.msg
1367 del resp.headers['Content-encoding']
1368 # brotli
1369 if resp.headers.get('Content-encoding', '') == 'br':
1370 resp = urllib.request.addinfourl(
1371 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1372 resp.msg = old_resp.msg
1373 del resp.headers['Content-encoding']
1374 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1375 # https://github.com/ytdl-org/youtube-dl/issues/6457).
1376 if 300 <= resp.code < 400:
1377 location = resp.headers.get('Location')
1378 if location:
1379 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1380 location = location.encode('iso-8859-1').decode()
1381 location_escaped = escape_url(location)
1382 if location != location_escaped:
1383 del resp.headers['Location']
1384 resp.headers['Location'] = location_escaped
1385 return resp
1386
1387 https_request = http_request
1388 https_response = http_response
1389
1390
1391 def make_socks_conn_class(base_class, socks_proxy):
1392 assert issubclass(base_class, (
1393 http.client.HTTPConnection, http.client.HTTPSConnection))
1394
1395 url_components = urllib.parse.urlparse(socks_proxy)
1396 if url_components.scheme.lower() == 'socks5':
1397 socks_type = ProxyType.SOCKS5
1398 elif url_components.scheme.lower() in ('socks', 'socks4'):
1399 socks_type = ProxyType.SOCKS4
1400 elif url_components.scheme.lower() == 'socks4a':
1401 socks_type = ProxyType.SOCKS4A
1402
1403 def unquote_if_non_empty(s):
1404 if not s:
1405 return s
1406 return urllib.parse.unquote_plus(s)
1407
1408 proxy_args = (
1409 socks_type,
1410 url_components.hostname, url_components.port or 1080,
1411 True, # Remote DNS
1412 unquote_if_non_empty(url_components.username),
1413 unquote_if_non_empty(url_components.password),
1414 )
1415
1416 class SocksConnection(base_class):
1417 def connect(self):
1418 self.sock = sockssocket()
1419 self.sock.setproxy(*proxy_args)
1420 if isinstance(self.timeout, (int, float)):
1421 self.sock.settimeout(self.timeout)
1422 self.sock.connect((self.host, self.port))
1423
1424 if isinstance(self, http.client.HTTPSConnection):
1425 if hasattr(self, '_context'): # Python > 2.6
1426 self.sock = self._context.wrap_socket(
1427 self.sock, server_hostname=self.host)
1428 else:
1429 self.sock = ssl.wrap_socket(self.sock)
1430
1431 return SocksConnection
1432
1433
1434 class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
1435 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1436 urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1437 self._https_conn_class = https_conn_class or http.client.HTTPSConnection
1438 self._params = params
1439
1440 def https_open(self, req):
1441 kwargs = {}
1442 conn_class = self._https_conn_class
1443
1444 if hasattr(self, '_context'): # python > 2.6
1445 kwargs['context'] = self._context
1446 if hasattr(self, '_check_hostname'): # python 3.x
1447 kwargs['check_hostname'] = self._check_hostname
1448
1449 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1450 if socks_proxy:
1451 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1452 del req.headers['Ytdl-socks-proxy']
1453
1454 try:
1455 return self.do_open(
1456 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1457 except urllib.error.URLError as e:
1458 if (isinstance(e.reason, ssl.SSLError)
1459 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1460 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1461 raise
1462
1463
1464 class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
1465 """
1466 See [1] for cookie file format.
1467
1468 1. https://curl.haxx.se/docs/http-cookies.html
1469 """
1470 _HTTPONLY_PREFIX = '#HttpOnly_'
1471 _ENTRY_LEN = 7
1472 _HEADER = '''# Netscape HTTP Cookie File
1473 # This file is generated by yt-dlp. Do not edit.
1474
1475 '''
1476 _CookieFileEntry = collections.namedtuple(
1477 'CookieFileEntry',
1478 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1479
1480 def __init__(self, filename=None, *args, **kwargs):
1481 super().__init__(None, *args, **kwargs)
1482 if self.is_path(filename):
1483 filename = os.fspath(filename)
1484 self.filename = filename
1485
1486 @staticmethod
1487 def _true_or_false(cndn):
1488 return 'TRUE' if cndn else 'FALSE'
1489
1490 @staticmethod
1491 def is_path(file):
1492 return isinstance(file, (str, bytes, os.PathLike))
1493
1494 @contextlib.contextmanager
1495 def open(self, file, *, write=False):
1496 if self.is_path(file):
1497 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1498 yield f
1499 else:
1500 if write:
1501 file.truncate(0)
1502 yield file
1503
1504 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1505 now = time.time()
1506 for cookie in self:
1507 if (not ignore_discard and cookie.discard
1508 or not ignore_expires and cookie.is_expired(now)):
1509 continue
1510 name, value = cookie.name, cookie.value
1511 if value is None:
1512 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1513 # with no name, whereas http.cookiejar regards it as a
1514 # cookie with no value.
1515 name, value = '', name
1516 f.write('%s\n' % '\t'.join((
1517 cookie.domain,
1518 self._true_or_false(cookie.domain.startswith('.')),
1519 cookie.path,
1520 self._true_or_false(cookie.secure),
1521 str_or_none(cookie.expires, default=''),
1522 name, value
1523 )))
1524
1525 def save(self, filename=None, *args, **kwargs):
1526 """
1527 Save cookies to a file.
1528 Code is taken from CPython 3.6
1529 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
1530
1531 if filename is None:
1532 if self.filename is not None:
1533 filename = self.filename
1534 else:
1535 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
1536
1537 # Store session cookies with `expires` set to 0 instead of an empty string
1538 for cookie in self:
1539 if cookie.expires is None:
1540 cookie.expires = 0
1541
1542 with self.open(filename, write=True) as f:
1543 f.write(self._HEADER)
1544 self._really_save(f, *args, **kwargs)
1545
1546 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1547 """Load cookies from a file."""
1548 if filename is None:
1549 if self.filename is not None:
1550 filename = self.filename
1551 else:
1552 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
1553
1554 def prepare_line(line):
1555 if line.startswith(self._HTTPONLY_PREFIX):
1556 line = line[len(self._HTTPONLY_PREFIX):]
1557 # comments and empty lines are fine
1558 if line.startswith('#') or not line.strip():
1559 return line
1560 cookie_list = line.split('\t')
1561 if len(cookie_list) != self._ENTRY_LEN:
1562 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
1563 cookie = self._CookieFileEntry(*cookie_list)
1564 if cookie.expires_at and not cookie.expires_at.isdigit():
1565 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1566 return line
1567
1568 cf = io.StringIO()
1569 with self.open(filename) as f:
1570 for line in f:
1571 try:
1572 cf.write(prepare_line(line))
1573 except http.cookiejar.LoadError as e:
1574 if f'{line.strip()} '[0] in '[{"':
1575 raise http.cookiejar.LoadError(
1576 'Cookies file must be Netscape formatted, not JSON. See '
1577 'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
1578 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
1579 continue
1580 cf.seek(0)
1581 self._really_load(cf, filename, ignore_discard, ignore_expires)
1582 # Session cookies are denoted by either `expires` field set to
1583 # an empty string or 0. MozillaCookieJar only recognizes the former
1584 # (see [1]). So we need force the latter to be recognized as session
1585 # cookies on our own.
1586 # Session cookies may be important for cookies-based authentication,
1587 # e.g. usually, when user does not check 'Remember me' check box while
1588 # logging in on a site, some important cookies are stored as session
1589 # cookies so that not recognizing them will result in failed login.
1590 # 1. https://bugs.python.org/issue17164
1591 for cookie in self:
1592 # Treat `expires=0` cookies as session cookies
1593 if cookie.expires == 0:
1594 cookie.expires = None
1595 cookie.discard = True
1596
1597
1598 class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
1599 def __init__(self, cookiejar=None):
1600 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
1601
1602 def http_response(self, request, response):
1603 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
1604
1605 https_request = urllib.request.HTTPCookieProcessor.http_request
1606 https_response = http_response
1607
1608
1609 class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
1610 """YoutubeDL redirect handler
1611
1612 The code is based on HTTPRedirectHandler implementation from CPython [1].
1613
1614 This redirect handler solves two issues:
1615 - ensures redirect URL is always unicode under python 2
1616 - introduces support for experimental HTTP response status code
1617 308 Permanent Redirect [2] used by some sites [3]
1618
1619 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1620 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1621 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1622 """
1623
1624 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
1625
1626 def redirect_request(self, req, fp, code, msg, headers, newurl):
1627 """Return a Request or None in response to a redirect.
1628
1629 This is called by the http_error_30x methods when a
1630 redirection response is received. If a redirection should
1631 take place, return a new Request to allow http_error_30x to
1632 perform the redirect. Otherwise, raise HTTPError if no-one
1633 else should try to handle this url. Return None if you can't
1634 but another Handler might.
1635 """
1636 m = req.get_method()
1637 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1638 or code in (301, 302, 303) and m == "POST")):
1639 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
1640 # Strictly (according to RFC 2616), 301 or 302 in response to
1641 # a POST MUST NOT cause a redirection without confirmation
1642 # from the user (of urllib.request, in this case). In practice,
1643 # essentially all clients do redirect in this case, so we do
1644 # the same.
1645
1646 # Be conciliant with URIs containing a space. This is mainly
1647 # redundant with the more complete encoding done in http_error_302(),
1648 # but it is kept for compatibility with other callers.
1649 newurl = newurl.replace(' ', '%20')
1650
1651 CONTENT_HEADERS = ("content-length", "content-type")
1652 # NB: don't use dict comprehension for python 2.6 compatibility
1653 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1654
1655 # A 303 must either use GET or HEAD for subsequent request
1656 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1657 if code == 303 and m != 'HEAD':
1658 m = 'GET'
1659 # 301 and 302 redirects are commonly turned into a GET from a POST
1660 # for subsequent requests by browsers, so we'll do the same.
1661 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1662 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1663 if code in (301, 302) and m == 'POST':
1664 m = 'GET'
1665
1666 return urllib.request.Request(
1667 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1668 unverifiable=True, method=m)
1669
1670
1671 def extract_timezone(date_str):
1672 m = re.search(
1673 r'''(?x)
1674 ^.{8,}? # >=8 char non-TZ prefix, if present
1675 (?P<tz>Z| # just the UTC Z, or
1676 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1677 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1678 [ ]? # optional space
1679 (?P<sign>\+|-) # +/-
1680 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1681 $)
1682 ''', date_str)
1683 if not m:
1684 timezone = datetime.timedelta()
1685 else:
1686 date_str = date_str[:-len(m.group('tz'))]
1687 if not m.group('sign'):
1688 timezone = datetime.timedelta()
1689 else:
1690 sign = 1 if m.group('sign') == '+' else -1
1691 timezone = datetime.timedelta(
1692 hours=sign * int(m.group('hours')),
1693 minutes=sign * int(m.group('minutes')))
1694 return timezone, date_str
1695
1696
1697 def parse_iso8601(date_str, delimiter='T', timezone=None):
1698 """ Return a UNIX timestamp from the given date """
1699
1700 if date_str is None:
1701 return None
1702
1703 date_str = re.sub(r'\.[0-9]+', '', date_str)
1704
1705 if timezone is None:
1706 timezone, date_str = extract_timezone(date_str)
1707
1708 with contextlib.suppress(ValueError):
1709 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1710 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1711 return calendar.timegm(dt.timetuple())
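
# For illustration (a minimal sketch; both forms normalise to UTC):
#   parse_iso8601('1970-01-01T00:00:00Z')       -> 0
#   parse_iso8601('1970-01-01T01:00:00+01:00')  -> 0  (the +01:00 offset is subtracted out)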
1712
1713
1714 def date_formats(day_first=True):
1715 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1716
1717
1718 def unified_strdate(date_str, day_first=True):
1719 """Return a string with the date in the format YYYYMMDD"""
1720
1721 if date_str is None:
1722 return None
1723 upload_date = None
1724 # Replace commas
1725 date_str = date_str.replace(',', ' ')
1726 # Remove AM/PM + timezone
1727 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1728 _, date_str = extract_timezone(date_str)
1729
1730 for expression in date_formats(day_first):
1731 with contextlib.suppress(ValueError):
1732 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1733 if upload_date is None:
1734 timetuple = email.utils.parsedate_tz(date_str)
1735 if timetuple:
1736 with contextlib.suppress(ValueError):
1737 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1738 if upload_date is not None:
1739 return str(upload_date)
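
# Example usage (with the default day_first=True):
#   unified_strdate('December 21, 2012')  -> '20121221'
#   unified_strdate('8/7/2009')           -> '20090708'  (day first: 8 July)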
1740
1741
1742 def unified_timestamp(date_str, day_first=True):
1743 if date_str is None:
1744 return None
1745
1746 date_str = re.sub(r'[,|]', '', date_str)
1747
1748 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1749 timezone, date_str = extract_timezone(date_str)
1750
1751 # Remove AM/PM + timezone
1752 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1753
1754 # Remove unrecognized timezones from ISO 8601 alike timestamps
1755 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1756 if m:
1757 date_str = date_str[:-len(m.group('tz'))]
1758
1759 # Python only supports microseconds, so remove nanoseconds
1760 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1761 if m:
1762 date_str = m.group(1)
1763
1764 for expression in date_formats(day_first):
1765 with contextlib.suppress(ValueError):
1766 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1767 return calendar.timegm(dt.timetuple())
1768 timetuple = email.utils.parsedate_tz(date_str)
1769 if timetuple:
1770 return calendar.timegm(timetuple) + pm_delta * 3600
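
# Example usage (RFC 2822 dates fall through to email.utils.parsedate_tz):
#   unified_timestamp('1970-01-01T00:00:00Z')            -> 0
#   unified_timestamp('Thu, 01 Jan 1970 00:00:00 +0000') -> 0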
1771
1772
1773 def determine_ext(url, default_ext='unknown_video'):
1774 if url is None or '.' not in url:
1775 return default_ext
1776 guess = url.partition('?')[0].rpartition('.')[2]
1777 if re.match(r'^[A-Za-z0-9]+$', guess):
1778 return guess
1779 # Try to extract the extension from URLs like http://example.com/foo/bar.mp4/?download
1780 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1781 return guess.rstrip('/')
1782 else:
1783 return default_ext
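
# Example usage (URLs are illustrative):
#   determine_ext('http://example.com/video.mp4')              -> 'mp4'
#   determine_ext('http://example.com/foo/bar.mp4/?download')  -> 'mp4'  (trailing slash stripped)
#   determine_ext('no extension here')                         -> 'unknown_video'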
1784
1785
1786 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1787 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1788
1789
1790 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1791 R"""
1792 Return a datetime object from a string.
1793 Supported format:
1794 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1795
1796 @param format strftime format of DATE
1797 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1798 auto: round to the unit provided in date_str (if applicable).
1799 """
1800 auto_precision = False
1801 if precision == 'auto':
1802 auto_precision = True
1803 precision = 'microsecond'
1804 today = datetime_round(datetime.datetime.utcnow(), precision)
1805 if date_str in ('now', 'today'):
1806 return today
1807 if date_str == 'yesterday':
1808 return today - datetime.timedelta(days=1)
1809 match = re.match(
1810 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1811 date_str)
1812 if match is not None:
1813 start_time = datetime_from_str(match.group('start'), precision, format)
1814 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1815 unit = match.group('unit')
1816 if unit == 'month' or unit == 'year':
1817 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1818 unit = 'day'
1819 else:
1820 if unit == 'week':
1821 unit = 'day'
1822 time *= 7
1823 delta = datetime.timedelta(**{unit + 's': time})
1824 new_date = start_time + delta
1825 if auto_precision:
1826 return datetime_round(new_date, unit)
1827 return new_date
1828
1829 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
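
# Example usage (relative offsets are applied to the current UTC time; with the
# default precision='auto', the result is rounded to the unit given in the string):
#   datetime_from_str('now-1week')       -> the datetime one week ago, rounded to day precision
#   datetime_from_str('20200101+3days')  -> datetime.datetime(2020, 1, 4, 0, 0)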
1830
1831
1832 def date_from_str(date_str, format='%Y%m%d', strict=False):
1833 R"""
1834 Return a date object from a string using datetime_from_str
1835
1836 @param strict Restrict allowed patterns to "YYYYMMDD" and
1837 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1838 """
1839 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1840 raise ValueError(f'Invalid date format "{date_str}"')
1841 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1842
1843
1844 def datetime_add_months(dt, months):
1845 """Increment/Decrement a datetime object by months."""
1846 month = dt.month + months - 1
1847 year = dt.year + month // 12
1848 month = month % 12 + 1
1849 day = min(dt.day, calendar.monthrange(year, month)[1])
1850 return dt.replace(year, month, day)
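
# Example usage (the day is clamped to the target month's length):
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1) -> datetime.datetime(2020, 2, 29, 0, 0)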
1851
1852
1853 def datetime_round(dt, precision='day'):
1854 """
1855 Round a datetime object's time to a specific precision
1856 """
1857 if precision == 'microsecond':
1858 return dt
1859
1860 unit_seconds = {
1861 'day': 86400,
1862 'hour': 3600,
1863 'minute': 60,
1864 'second': 1,
1865 }
1866 roundto = lambda x, n: ((x + n / 2) // n) * n
1867 timestamp = calendar.timegm(dt.timetuple())
1868 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1869
1870
1871 def hyphenate_date(date_str):
1872 """
1873 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1874 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1875 if match is not None:
1876 return '-'.join(match.groups())
1877 else:
1878 return date_str
1879
1880
1881 class DateRange:
1882 """Represents a time interval between two dates"""
1883
1884 def __init__(self, start=None, end=None):
1885 """start and end must be strings in the format accepted by date"""
1886 if start is not None:
1887 self.start = date_from_str(start, strict=True)
1888 else:
1889 self.start = datetime.datetime.min.date()
1890 if end is not None:
1891 self.end = date_from_str(end, strict=True)
1892 else:
1893 self.end = datetime.datetime.max.date()
1894 if self.start > self.end:
1895 raise ValueError('Date range: "%s"; the start date must be before the end date' % self)
1896
1897 @classmethod
1898 def day(cls, day):
1899 """Returns a range that only contains the given day"""
1900 return cls(day, day)
1901
1902 def __contains__(self, date):
1903 """Check if the date is in the range"""
1904 if not isinstance(date, datetime.date):
1905 date = date_from_str(date)
1906 return self.start <= date <= self.end
1907
1908 def __str__(self):
1909 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1910
1911 def __eq__(self, other):
1912 return (isinstance(other, DateRange)
1913 and self.start == other.start and self.end == other.end)
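
# Example usage (bounds are inclusive and default to the minimum/maximum date):
#   '20200115' in DateRange('20200101', '20200131') -> True
#   '20200201' in DateRange(start='20200101')       -> True  (open-ended range)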
1914
1915
1916 def platform_name():
1917 """ Returns the platform name as a str """
1918 write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead')
1919 return platform.platform()
1920
1921
1922 @functools.cache
1923 def system_identifier():
1924 python_implementation = platform.python_implementation()
1925 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1926 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1927
1928 return 'Python %s (%s %s) - %s %s' % (
1929 platform.python_version(),
1930 python_implementation,
1931 platform.architecture()[0],
1932 platform.platform(),
1933 format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
1934 )
1935
1936
1937 @functools.cache
1938 def get_windows_version():
1939 ''' Get the Windows version. Returns () if not running on Windows '''
1940 if compat_os_name == 'nt':
1941 return version_tuple(platform.win32_ver()[1])
1942 else:
1943 return ()
1944
1945
1946 def write_string(s, out=None, encoding=None):
1947 assert isinstance(s, str)
1948 out = out or sys.stderr
1949
1950 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1951 s = re.sub(r'([\r\n]+)', r' \1', s)
1952
1953 enc, buffer = None, out
1954 if 'b' in getattr(out, 'mode', ''):
1955 enc = encoding or preferredencoding()
1956 elif hasattr(out, 'buffer'):
1957 buffer = out.buffer
1958 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1959
1960 buffer.write(s.encode(enc, 'ignore') if enc else s)
1961 out.flush()
1962
1963
1964 def bytes_to_intlist(bs):
1965 if not bs:
1966 return []
1967 if isinstance(bs[0], int): # bytes and bytearray index as ints
1968 return list(bs)
1969 else:
1970 return [ord(c) for c in bs]
1971
1972
1973 def intlist_to_bytes(xs):
1974 if not xs:
1975 return b''
1976 return struct.pack('%dB' % len(xs), *xs)
1977
1978
1979 class LockingUnsupportedError(OSError):
1980 msg = 'File locking is not supported'
1981
1982 def __init__(self):
1983 super().__init__(self.msg)
1984
1985
1986 # Cross-platform file locking
1987 if sys.platform == 'win32':
1988 import ctypes.wintypes
1989 import msvcrt
1990
1991 class OVERLAPPED(ctypes.Structure):
1992 _fields_ = [
1993 ('Internal', ctypes.wintypes.LPVOID),
1994 ('InternalHigh', ctypes.wintypes.LPVOID),
1995 ('Offset', ctypes.wintypes.DWORD),
1996 ('OffsetHigh', ctypes.wintypes.DWORD),
1997 ('hEvent', ctypes.wintypes.HANDLE),
1998 ]
1999
2000 kernel32 = ctypes.windll.kernel32
2001 LockFileEx = kernel32.LockFileEx
2002 LockFileEx.argtypes = [
2003 ctypes.wintypes.HANDLE, # hFile
2004 ctypes.wintypes.DWORD, # dwFlags
2005 ctypes.wintypes.DWORD, # dwReserved
2006 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2007 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2008 ctypes.POINTER(OVERLAPPED) # Overlapped
2009 ]
2010 LockFileEx.restype = ctypes.wintypes.BOOL
2011 UnlockFileEx = kernel32.UnlockFileEx
2012 UnlockFileEx.argtypes = [
2013 ctypes.wintypes.HANDLE, # hFile
2014 ctypes.wintypes.DWORD, # dwReserved
2015 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2016 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2017 ctypes.POINTER(OVERLAPPED) # Overlapped
2018 ]
2019 UnlockFileEx.restype = ctypes.wintypes.BOOL
2020 whole_low = 0xffffffff
2021 whole_high = 0x7fffffff
2022
2023 def _lock_file(f, exclusive, block):
2024 overlapped = OVERLAPPED()
2025 overlapped.Offset = 0
2026 overlapped.OffsetHigh = 0
2027 overlapped.hEvent = 0
2028 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2029
2030 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2031 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2032 0, whole_low, whole_high, f._lock_file_overlapped_p):
2033 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2034 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2035
2036 def _unlock_file(f):
2037 assert f._lock_file_overlapped_p
2038 handle = msvcrt.get_osfhandle(f.fileno())
2039 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2040 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2041
2042 else:
2043 try:
2044 import fcntl
2045
2046 def _lock_file(f, exclusive, block):
2047 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2048 if not block:
2049 flags |= fcntl.LOCK_NB
2050 try:
2051 fcntl.flock(f, flags)
2052 except BlockingIOError:
2053 raise
2054 except OSError: # AOSP does not have flock()
2055 fcntl.lockf(f, flags)
2056
2057 def _unlock_file(f):
2058 try:
2059 fcntl.flock(f, fcntl.LOCK_UN)
2060 except OSError:
2061 fcntl.lockf(f, fcntl.LOCK_UN)
2062
2063 except ImportError:
2064
2065 def _lock_file(f, exclusive, block):
2066 raise LockingUnsupportedError()
2067
2068 def _unlock_file(f):
2069 raise LockingUnsupportedError()
2070
2071
2072 class locked_file:
2073 locked = False
2074
2075 def __init__(self, filename, mode, block=True, encoding=None):
2076 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2077 raise NotImplementedError(mode)
2078 self.mode, self.block = mode, block
2079
2080 writable = any(f in mode for f in 'wax+')
2081 readable = any(f in mode for f in 'r+')
2082 flags = functools.reduce(operator.ior, (
2083 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2084 getattr(os, 'O_BINARY', 0), # Windows only
2085 getattr(os, 'O_NOINHERIT', 0), # Windows only
2086 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2087 os.O_APPEND if 'a' in mode else 0,
2088 os.O_EXCL if 'x' in mode else 0,
2089 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2090 ))
2091
2092 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2093
2094 def __enter__(self):
2095 exclusive = 'r' not in self.mode
2096 try:
2097 _lock_file(self.f, exclusive, self.block)
2098 self.locked = True
2099 except OSError:
2100 self.f.close()
2101 raise
2102 if 'w' in self.mode:
2103 try:
2104 self.f.truncate()
2105 except OSError as e:
2106 if e.errno not in (
2107 errno.ESPIPE, # Illegal seek - expected for FIFO
2108 errno.EINVAL, # Invalid argument - expected for /dev/null
2109 ):
2110 raise
2111 return self
2112
2113 def unlock(self):
2114 if not self.locked:
2115 return
2116 try:
2117 _unlock_file(self.f)
2118 finally:
2119 self.locked = False
2120
2121 def __exit__(self, *_):
2122 try:
2123 self.unlock()
2124 finally:
2125 self.f.close()
2126
2127 open = __enter__
2128 close = __exit__
2129
2130 def __getattr__(self, attr):
2131 return getattr(self.f, attr)
2132
2133 def __iter__(self):
2134 return iter(self.f)
2135
2136
2137 @functools.cache
2138 def get_filesystem_encoding():
2139 encoding = sys.getfilesystemencoding()
2140 return encoding if encoding is not None else 'utf-8'
2141
2142
2143 def shell_quote(args):
2144 quoted_args = []
2145 encoding = get_filesystem_encoding()
2146 for a in args:
2147 if isinstance(a, bytes):
2148 # We may get a filename encoded with 'encodeFilename'
2149 a = a.decode(encoding)
2150 quoted_args.append(compat_shlex_quote(a))
2151 return ' '.join(quoted_args)
2152
2153
2154 def smuggle_url(url, data):
2155 """ Pass additional data in a URL for internal use. """
2156
2157 url, idata = unsmuggle_url(url, {})
2158 data.update(idata)
2159 sdata = urllib.parse.urlencode(
2160 {'__youtubedl_smuggle': json.dumps(data)})
2161 return url + '#' + sdata
2162
2163
2164 def unsmuggle_url(smug_url, default=None):
2165 if '#__youtubedl_smuggle' not in smug_url:
2166 return smug_url, default
2167 url, _, sdata = smug_url.rpartition('#')
2168 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2169 data = json.loads(jsond)
2170 return url, data
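
# Example round trip (URL is illustrative; the payload travels in the fragment):
#   url = smuggle_url('https://example.com/watch', {'referer': 'https://example.com'})
#   unsmuggle_url(url) -> ('https://example.com/watch', {'referer': 'https://example.com'})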
2171
2172
2173 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2174 """ Formats numbers with decimal sufixes like K, M, etc """
2175 num, factor = float_or_none(num), float(factor)
2176 if num is None or num < 0:
2177 return None
2178 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2179 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2180 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2181 if factor == 1024:
2182 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2183 converted = num / (factor ** exponent)
2184 return fmt % (converted, suffix)
2185
2186
2187 def format_bytes(bytes):
2188 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
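
# Example usage (factor=1024 switches the suffixes to KiB/MiB/...):
#   format_decimal_suffix(12300) -> '12k'
#   format_bytes(1536)           -> '1.50KiB'
#   format_bytes(None)           -> 'N/A'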
2189
2190
2191 def lookup_unit_table(unit_table, s):
2192 units_re = '|'.join(re.escape(u) for u in unit_table)
2193 m = re.match(
2194 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2195 if not m:
2196 return None
2197 num_str = m.group('num').replace(',', '.')
2198 mult = unit_table[m.group('unit')]
2199 return int(float(num_str) * mult)
2200
2201
2202 def parse_filesize(s):
2203 if s is None:
2204 return None
2205
2206 # The lower-case forms are of course incorrect and unofficial,
2207 # but we support those too
2208 _UNIT_TABLE = {
2209 'B': 1,
2210 'b': 1,
2211 'bytes': 1,
2212 'KiB': 1024,
2213 'KB': 1000,
2214 'kB': 1024,
2215 'Kb': 1000,
2216 'kb': 1000,
2217 'kilobytes': 1000,
2218 'kibibytes': 1024,
2219 'MiB': 1024 ** 2,
2220 'MB': 1000 ** 2,
2221 'mB': 1024 ** 2,
2222 'Mb': 1000 ** 2,
2223 'mb': 1000 ** 2,
2224 'megabytes': 1000 ** 2,
2225 'mebibytes': 1024 ** 2,
2226 'GiB': 1024 ** 3,
2227 'GB': 1000 ** 3,
2228 'gB': 1024 ** 3,
2229 'Gb': 1000 ** 3,
2230 'gb': 1000 ** 3,
2231 'gigabytes': 1000 ** 3,
2232 'gibibytes': 1024 ** 3,
2233 'TiB': 1024 ** 4,
2234 'TB': 1000 ** 4,
2235 'tB': 1024 ** 4,
2236 'Tb': 1000 ** 4,
2237 'tb': 1000 ** 4,
2238 'terabytes': 1000 ** 4,
2239 'tebibytes': 1024 ** 4,
2240 'PiB': 1024 ** 5,
2241 'PB': 1000 ** 5,
2242 'pB': 1024 ** 5,
2243 'Pb': 1000 ** 5,
2244 'pb': 1000 ** 5,
2245 'petabytes': 1000 ** 5,
2246 'pebibytes': 1024 ** 5,
2247 'EiB': 1024 ** 6,
2248 'EB': 1000 ** 6,
2249 'eB': 1024 ** 6,
2250 'Eb': 1000 ** 6,
2251 'eb': 1000 ** 6,
2252 'exabytes': 1000 ** 6,
2253 'exbibytes': 1024 ** 6,
2254 'ZiB': 1024 ** 7,
2255 'ZB': 1000 ** 7,
2256 'zB': 1024 ** 7,
2257 'Zb': 1000 ** 7,
2258 'zb': 1000 ** 7,
2259 'zettabytes': 1000 ** 7,
2260 'zebibytes': 1024 ** 7,
2261 'YiB': 1024 ** 8,
2262 'YB': 1000 ** 8,
2263 'yB': 1024 ** 8,
2264 'Yb': 1000 ** 8,
2265 'yb': 1000 ** 8,
2266 'yottabytes': 1000 ** 8,
2267 'yobibytes': 1024 ** 8,
2268 }
2269
2270 return lookup_unit_table(_UNIT_TABLE, s)
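
# Example usage (a decimal comma is accepted as a decimal point):
#   parse_filesize('5 MiB')   -> 5242880
#   parse_filesize('1,24 KB') -> 1240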
2271
2272
2273 def parse_count(s):
2274 if s is None:
2275 return None
2276
2277 s = re.sub(r'^[^\d]+\s', '', s).strip()
2278
2279 if re.match(r'^[\d,.]+$', s):
2280 return str_to_int(s)
2281
2282 _UNIT_TABLE = {
2283 'k': 1000,
2284 'K': 1000,
2285 'm': 1000 ** 2,
2286 'M': 1000 ** 2,
2287 'kk': 1000 ** 2,
2288 'KK': 1000 ** 2,
2289 'b': 1000 ** 3,
2290 'B': 1000 ** 3,
2291 }
2292
2293 ret = lookup_unit_table(_UNIT_TABLE, s)
2294 if ret is not None:
2295 return ret
2296
2297 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2298 if mobj:
2299 return str_to_int(mobj.group(1))
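
# Example usage:
#   parse_count('1.23M')       -> 1230000
#   parse_count('1,000 views') -> 1000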
2300
2301
2302 def parse_resolution(s, *, lenient=False):
2303 if s is None:
2304 return {}
2305
2306 if lenient:
2307 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2308 else:
2309 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2310 if mobj:
2311 return {
2312 'width': int(mobj.group('w')),
2313 'height': int(mobj.group('h')),
2314 }
2315
2316 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2317 if mobj:
2318 return {'height': int(mobj.group(1))}
2319
2320 mobj = re.search(r'\b([48])[kK]\b', s)
2321 if mobj:
2322 return {'height': int(mobj.group(1)) * 540}
2323
2324 return {}
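
# Example usage ("4k"/"8k" are mapped to heights via the 540-line factor above):
#   parse_resolution('1920x1080') -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')      -> {'height': 720}
#   parse_resolution('4k')        -> {'height': 2160}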
2325
2326
2327 def parse_bitrate(s):
2328 if not isinstance(s, str):
2329 return
2330 mobj = re.search(r'\b(\d+)\s*kbps', s)
2331 if mobj:
2332 return int(mobj.group(1))
2333
2334
2335 def month_by_name(name, lang='en'):
2336 """ Return the number of a month by (locale-independently) English name """
2337
2338 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2339
2340 try:
2341 return month_names.index(name) + 1
2342 except ValueError:
2343 return None
2344
2345
2346 def month_by_abbreviation(abbrev):
2347 """ Return the number of a month by (locale-independently) English
2348 abbreviations """
2349
2350 try:
2351 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2352 except ValueError:
2353 return None
2354
2355
2356 def fix_xml_ampersands(xml_str):
2357 """Replace all the '&' by '&amp;' in XML"""
2358 return re.sub(
2359 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2360 '&amp;',
2361 xml_str)
2362
2363
2364 def setproctitle(title):
2365 assert isinstance(title, str)
2366
2367 # ctypes in Jython is not complete
2368 # http://bugs.jython.org/issue2148
2369 if sys.platform.startswith('java'):
2370 return
2371
2372 try:
2373 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2374 except OSError:
2375 return
2376 except TypeError:
2377 # LoadLibrary in Windows Python 2.7.13 only expects
2378 # a bytestring, but since unicode_literals turns
2379 # every string into a unicode string, it fails.
2380 return
2381 title_bytes = title.encode()
2382 buf = ctypes.create_string_buffer(len(title_bytes))
2383 buf.value = title_bytes
2384 try:
2385 libc.prctl(15, buf, 0, 0, 0)
2386 except AttributeError:
2387 return # Strange libc, just skip this
2388
2389
2390 def remove_start(s, start):
2391 return s[len(start):] if s is not None and s.startswith(start) else s
2392
2393
2394 def remove_end(s, end):
2395 return s[:-len(end)] if s is not None and s.endswith(end) else s
2396
2397
2398 def remove_quotes(s):
2399 if s is None or len(s) < 2:
2400 return s
2401 for quote in ('"', "'", ):
2402 if s[0] == quote and s[-1] == quote:
2403 return s[1:-1]
2404 return s
2405
2406
2407 def get_domain(url):
2408 return '.'.join(urllib.parse.urlparse(url).netloc.rsplit('.', 2)[-2:])
2409
2410
2411 def url_basename(url):
2412 path = urllib.parse.urlparse(url).path
2413 return path.strip('/').split('/')[-1]
2414
2415
2416 def base_url(url):
2417 return re.match(r'https?://[^?#&]+/', url).group()
2418
2419
2420 def urljoin(base, path):
2421 if isinstance(path, bytes):
2422 path = path.decode()
2423 if not isinstance(path, str) or not path:
2424 return None
2425 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2426 return path
2427 if isinstance(base, bytes):
2428 base = base.decode()
2429 if not isinstance(base, str) or not re.match(
2430 r'^(?:https?:)?//', base):
2431 return None
2432 return urllib.parse.urljoin(base, path)
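
# Example usage (URLs are illustrative; protocol-relative paths are returned unchanged):
#   urljoin('http://foo.de/', '/a/b/c.txt')         -> 'http://foo.de/a/b/c.txt'
#   urljoin('http://foo.de/', '//foo.de/a/b/c.txt') -> '//foo.de/a/b/c.txt'
#   urljoin(None, '/a/b/c.txt')                     -> None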
2433
2434
2435 class HEADRequest(urllib.request.Request):
2436 def get_method(self):
2437 return 'HEAD'
2438
2439
2440 class PUTRequest(urllib.request.Request):
2441 def get_method(self):
2442 return 'PUT'
2443
2444
2445 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2446 if get_attr and v is not None:
2447 v = getattr(v, get_attr, None)
2448 try:
2449 return int(v) * invscale // scale
2450 except (ValueError, TypeError, OverflowError):
2451 return default
2452
2453
2454 def str_or_none(v, default=None):
2455 return default if v is None else str(v)
2456
2457
2458 def str_to_int(int_str):
2459 """ A more relaxed version of int_or_none """
2460 if isinstance(int_str, int):
2461 return int_str
2462 elif isinstance(int_str, str):
2463 int_str = re.sub(r'[,\.\+]', '', int_str)
2464 return int_or_none(int_str)
2465
2466
2467 def float_or_none(v, scale=1, invscale=1, default=None):
2468 if v is None:
2469 return default
2470 try:
2471 return float(v) * invscale / scale
2472 except (ValueError, TypeError):
2473 return default
2474
2475
2476 def bool_or_none(v, default=None):
2477 return v if isinstance(v, bool) else default
2478
2479
2480 def strip_or_none(v, default=None):
2481 return v.strip() if isinstance(v, str) else default
2482
2483
2484 def url_or_none(url):
2485 if not url or not isinstance(url, str):
2486 return None
2487 url = url.strip()
2488 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2489
2490
2491 def request_to_url(req):
2492 if isinstance(req, urllib.request.Request):
2493 return req.get_full_url()
2494 else:
2495 return req
2496
2497
2498 def strftime_or_none(timestamp, date_format, default=None):
2499 datetime_object = None
2500 try:
2501 if isinstance(timestamp, (int, float)): # unix timestamp
2502 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2503 elif isinstance(timestamp, str): # assume YYYYMMDD
2504 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2505 return datetime_object.strftime(date_format)
2506 except (ValueError, TypeError, AttributeError):
2507 return default
2508
2509
2510 def parse_duration(s):
2511 if not isinstance(s, str):
2512 return None
2513 s = s.strip()
2514 if not s:
2515 return None
2516
2517 days, hours, mins, secs, ms = [None] * 5
2518 m = re.match(r'''(?x)
2519 (?P<before_secs>
2520 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2521 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2522 (?P<ms>[.:][0-9]+)?Z?$
2523 ''', s)
2524 if m:
2525 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2526 else:
2527 m = re.match(
2528 r'''(?ix)(?:P?
2529 (?:
2530 [0-9]+\s*y(?:ears?)?,?\s*
2531 )?
2532 (?:
2533 [0-9]+\s*m(?:onths?)?,?\s*
2534 )?
2535 (?:
2536 [0-9]+\s*w(?:eeks?)?,?\s*
2537 )?
2538 (?:
2539 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2540 )?
2541 T)?
2542 (?:
2543 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2544 )?
2545 (?:
2546 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2547 )?
2548 (?:
2549 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2550 )?Z?$''', s)
2551 if m:
2552 days, hours, mins, secs, ms = m.groups()
2553 else:
2554 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2555 if m:
2556 hours, mins = m.groups()
2557 else:
2558 return None
2559
2560 if ms:
2561 ms = ms.replace(':', '.')
2562 return sum(float(part or 0) * mult for part, mult in (
2563 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
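
# Example usage (clock-style, ISO 8601-like and verbose forms are all accepted):
#   parse_duration('9:12:43') -> 33163.0
#   parse_duration('PT0M10S') -> 10.0
#   parse_duration('3 min')   -> 180.0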
2564
2565
2566 def prepend_extension(filename, ext, expected_real_ext=None):
2567 name, real_ext = os.path.splitext(filename)
2568 return (
2569 f'{name}.{ext}{real_ext}'
2570 if not expected_real_ext or real_ext[1:] == expected_real_ext
2571 else f'{filename}.{ext}')
2572
2573
2574 def replace_extension(filename, ext, expected_real_ext=None):
2575 name, real_ext = os.path.splitext(filename)
2576 return '{}.{}'.format(
2577 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2578 ext)
2579
2580
2581 def check_executable(exe, args=[]):
2582 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2583 args can be a list of arguments for a short output (like -version) """
2584 try:
2585 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2586 except OSError:
2587 return False
2588 return exe
2589
2590
2591 def _get_exe_version_output(exe, args, *, to_screen=None):
2592 if to_screen:
2593 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2594 try:
2595 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2596 # SIGTTOU if yt-dlp is run in the background.
2597 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2598 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2599 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2600 except OSError:
2601 return False
2602 return stdout
2603
2604
2605 def detect_exe_version(output, version_re=None, unrecognized='present'):
2606 assert isinstance(output, str)
2607 if version_re is None:
2608 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2609 m = re.search(version_re, output)
2610 if m:
2611 return m.group(1)
2612 else:
2613 return unrecognized
2614
2615
2616 def get_exe_version(exe, args=['--version'],
2617 version_re=None, unrecognized='present'):
2618 """ Returns the version of the specified executable,
2619 or False if the executable is not present """
2620 out = _get_exe_version_output(exe, args)
2621 return detect_exe_version(out, version_re, unrecognized) if out else False
2622
2623
2624 def frange(start=0, stop=None, step=1):
2625 """Float range"""
2626 if stop is None:
2627 start, stop = 0, start
2628 sign = [-1, 1][step > 0] if step else 0
2629 while sign * start < sign * stop:
2630 yield start
2631 start += step
2632
2633
2634 class LazyList(collections.abc.Sequence):
2635 """Lazy immutable list from an iterable
2636 Note that slices of a LazyList are lists and not LazyList"""
2637
2638 class IndexError(IndexError):
2639 pass
2640
2641 def __init__(self, iterable, *, reverse=False, _cache=None):
2642 self._iterable = iter(iterable)
2643 self._cache = [] if _cache is None else _cache
2644 self._reversed = reverse
2645
2646 def __iter__(self):
2647 if self._reversed:
2648 # We need to consume the entire iterable to iterate in reverse
2649 yield from self.exhaust()
2650 return
2651 yield from self._cache
2652 for item in self._iterable:
2653 self._cache.append(item)
2654 yield item
2655
2656 def _exhaust(self):
2657 self._cache.extend(self._iterable)
2658 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2659 return self._cache
2660
2661 def exhaust(self):
2662 """Evaluate the entire iterable"""
2663 return self._exhaust()[::-1 if self._reversed else 1]
2664
2665 @staticmethod
2666 def _reverse_index(x):
2667 return None if x is None else ~x
2668
2669 def __getitem__(self, idx):
2670 if isinstance(idx, slice):
2671 if self._reversed:
2672 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2673 start, stop, step = idx.start, idx.stop, idx.step or 1
2674 elif isinstance(idx, int):
2675 if self._reversed:
2676 idx = self._reverse_index(idx)
2677 start, stop, step = idx, idx, 0
2678 else:
2679 raise TypeError('indices must be integers or slices')
2680 if ((start or 0) < 0 or (stop or 0) < 0
2681 or (start is None and step < 0)
2682 or (stop is None and step > 0)):
2683 # We need to consume the entire iterable to be able to slice from the end
2684 # Obviously, never use this with infinite iterables
2685 self._exhaust()
2686 try:
2687 return self._cache[idx]
2688 except IndexError as e:
2689 raise self.IndexError(e) from e
2690 n = max(start or 0, stop or 0) - len(self._cache) + 1
2691 if n > 0:
2692 self._cache.extend(itertools.islice(self._iterable, n))
2693 try:
2694 return self._cache[idx]
2695 except IndexError as e:
2696 raise self.IndexError(e) from e
2697
2698 def __bool__(self):
2699 try:
2700 self[-1] if self._reversed else self[0]
2701 except self.IndexError:
2702 return False
2703 return True
2704
2705 def __len__(self):
2706 self._exhaust()
2707 return len(self._cache)
2708
2709 def __reversed__(self):
2710 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2711
2712 def __copy__(self):
2713 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2714
2715 def __repr__(self):
2716 # repr and str should mimic a list. So we exhaust the iterable
2717 return repr(self.exhaust())
2718
2719 def __str__(self):
2720 return repr(self.exhaust())
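
# Example usage (only as much of the iterable as the index needs is consumed):
#   lazy = LazyList(itertools.count())
#   lazy[:5]  -> [0, 1, 2, 3, 4]  (slices are plain lists, per the docstring above)
#   lazy[10]  -> 10               (items 0-10 are now cached)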
2721
2722
2723 class PagedList:
2724
2725 class IndexError(IndexError):
2726 pass
2727
2728 def __len__(self):
2729 # This is only useful for tests
2730 return len(self.getslice())
2731
2732 def __init__(self, pagefunc, pagesize, use_cache=True):
2733 self._pagefunc = pagefunc
2734 self._pagesize = pagesize
2735 self._pagecount = float('inf')
2736 self._use_cache = use_cache
2737 self._cache = {}
2738
2739 def getpage(self, pagenum):
2740 page_results = self._cache.get(pagenum)
2741 if page_results is None:
2742 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2743 if self._use_cache:
2744 self._cache[pagenum] = page_results
2745 return page_results
2746
2747 def getslice(self, start=0, end=None):
2748 return list(self._getslice(start, end))
2749
2750 def _getslice(self, start, end):
2751 raise NotImplementedError('This method must be implemented by subclasses')
2752
2753 def __getitem__(self, idx):
2754 assert self._use_cache, 'Indexing PagedList requires cache'
2755 if not isinstance(idx, int) or idx < 0:
2756 raise TypeError('indices must be non-negative integers')
2757 entries = self.getslice(idx, idx + 1)
2758 if not entries:
2759 raise self.IndexError()
2760 return entries[0]
2761
2762
2763 class OnDemandPagedList(PagedList):
2764 """Download pages until a page with less than maximum results"""
2765
2766 def _getslice(self, start, end):
2767 for pagenum in itertools.count(start // self._pagesize):
2768 firstid = pagenum * self._pagesize
2769 nextfirstid = pagenum * self._pagesize + self._pagesize
2770 if start >= nextfirstid:
2771 continue
2772
2773 startv = (
2774 start % self._pagesize
2775 if firstid <= start < nextfirstid
2776 else 0)
2777 endv = (
2778 ((end - 1) % self._pagesize) + 1
2779 if (end is not None and firstid <= end <= nextfirstid)
2780 else None)
2781
2782 try:
2783 page_results = self.getpage(pagenum)
2784 except Exception:
2785 self._pagecount = pagenum - 1
2786 raise
2787 if startv != 0 or endv is not None:
2788 page_results = page_results[startv:endv]
2789 yield from page_results
2790
2791 # A little optimization: if the current page is not "full", i.e. does
2792 # not contain page_size videos, then we can assume that this page
2793 # is the last one - there are no more ids on further pages -
2794 # so there is no need to query again.
2795 if len(page_results) + startv < self._pagesize:
2796 break
2797
2798 # If we got the whole page, but the next page is not interesting,
2799 # break out early as well
2800 if end == nextfirstid:
2801 break
2802
2803
2804 class InAdvancePagedList(PagedList):
2805 """PagedList with total number of pages known in advance"""
2806
2807 def __init__(self, pagefunc, pagecount, pagesize):
2808 PagedList.__init__(self, pagefunc, pagesize, True)
2809 self._pagecount = pagecount
2810
2811 def _getslice(self, start, end):
2812 start_page = start // self._pagesize
2813 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2814 skip_elems = start - start_page * self._pagesize
2815 only_more = None if end is None else end - start
2816 for pagenum in range(start_page, end_page):
2817 page_results = self.getpage(pagenum)
2818 if skip_elems:
2819 page_results = page_results[skip_elems:]
2820 skip_elems = None
2821 if only_more is not None:
2822 if len(page_results) < only_more:
2823 only_more -= len(page_results)
2824 else:
2825 yield from page_results[:only_more]
2826 break
2827 yield from page_results
2828
2829
2830 class PlaylistEntries:
2831 MissingEntry = object()
2832 is_exhausted = False
2833
2834 def __init__(self, ydl, info_dict):
2835 self.ydl = ydl
2836
2837 # _entries must be assigned now since infodict can change during iteration
2838 entries = info_dict.get('entries')
2839 if entries is None:
2840 raise EntryNotInPlaylist('There are no entries')
2841 elif isinstance(entries, list):
2842 self.is_exhausted = True
2843
2844 requested_entries = info_dict.get('requested_entries')
2845 self.is_incomplete = bool(requested_entries)
2846 if self.is_incomplete:
2847 assert self.is_exhausted
2848 self._entries = [self.MissingEntry] * max(requested_entries)
2849 for i, entry in zip(requested_entries, entries):
2850 self._entries[i - 1] = entry
2851 elif isinstance(entries, (list, PagedList, LazyList)):
2852 self._entries = entries
2853 else:
2854 self._entries = LazyList(entries)
2855
2856 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2857 (?P<start>[+-]?\d+)?
2858 (?P<range>[:-]
2859 (?P<end>[+-]?\d+|inf(?:inite)?)?
2860 (?::(?P<step>[+-]?\d+))?
2861 )?''')
2862
2863 @classmethod
2864 def parse_playlist_items(cls, string):
2865 for segment in string.split(','):
2866 if not segment:
2867 raise ValueError('There are two or more consecutive commas')
2868 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2869 if not mobj:
2870 raise ValueError(f'{segment!r} is not a valid specification')
2871 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2872 if int_or_none(step) == 0:
2873 raise ValueError(f'Step in {segment!r} cannot be zero')
2874 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2875
2876 def get_requested_items(self):
2877 playlist_items = self.ydl.params.get('playlist_items')
2878 playlist_start = self.ydl.params.get('playliststart', 1)
2879 playlist_end = self.ydl.params.get('playlistend')
2880 # For backwards compatibility, interpret -1 as whole list
2881 if playlist_end in (-1, None):
2882 playlist_end = ''
2883 if not playlist_items:
2884 playlist_items = f'{playlist_start}:{playlist_end}'
2885 elif playlist_start != 1 or playlist_end:
2886 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2887
2888 for index in self.parse_playlist_items(playlist_items):
2889 for i, entry in self[index]:
2890 yield i, entry
2891 if not entry:
2892 continue
2893 try:
2894 # TODO: Add auto-generated fields
2895 self.ydl._match_entry(entry, incomplete=True, silent=True)
2896 except (ExistingVideoReached, RejectedVideoReached):
2897 return
2898
2899 def get_full_count(self):
2900 if self.is_exhausted and not self.is_incomplete:
2901 return len(self)
2902 elif isinstance(self._entries, InAdvancePagedList):
2903 if self._entries._pagesize == 1:
2904 return self._entries._pagecount
2905
2906 @functools.cached_property
2907 def _getter(self):
2908 if isinstance(self._entries, list):
2909 def get_entry(i):
2910 try:
2911 entry = self._entries[i]
2912 except IndexError:
2913 entry = self.MissingEntry
2914 if not self.is_incomplete:
2915 raise self.IndexError()
2916 if entry is self.MissingEntry:
2917 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2918 return entry
2919 else:
2920 def get_entry(i):
2921 try:
2922 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2923 except (LazyList.IndexError, PagedList.IndexError):
2924 raise self.IndexError()
2925 return get_entry
2926
2927 def __getitem__(self, idx):
2928 if isinstance(idx, int):
2929 idx = slice(idx, idx)
2930
2931 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2932 step = 1 if idx.step is None else idx.step
2933 if idx.start is None:
2934 start = 0 if step > 0 else len(self) - 1
2935 else:
2936 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2937
2938 # NB: Do not call len(self) when idx == [:]
2939 if idx.stop is None:
2940 stop = 0 if step < 0 else float('inf')
2941 else:
2942 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2943 stop += [-1, 1][step > 0]
2944
2945 for i in frange(start, stop, step):
2946 if i < 0:
2947 continue
2948 try:
2949 entry = self._getter(i)
2950 except self.IndexError:
2951 self.is_exhausted = True
2952 if step > 0:
2953 break
2954 continue
2955 yield i + 1, entry
2956
2957 def __len__(self):
2958 return len(tuple(self[:]))
2959
2960 class IndexError(IndexError):
2961 pass
2962
2963
2964 def uppercase_escape(s):
2965 unicode_escape = codecs.getdecoder('unicode_escape')
2966 return re.sub(
2967 r'\\U[0-9a-fA-F]{8}',
2968 lambda m: unicode_escape(m.group(0))[0],
2969 s)
2970
2971
2972 def lowercase_escape(s):
2973 unicode_escape = codecs.getdecoder('unicode_escape')
2974 return re.sub(
2975 r'\\u[0-9a-fA-F]{4}',
2976 lambda m: unicode_escape(m.group(0))[0],
2977 s)
2978
2979
2980 def escape_rfc3986(s):
2981 """Escape non-ASCII characters as suggested by RFC 3986"""
2982 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2983
2984
2985 def escape_url(url):
2986 """Escape URL as suggested by RFC 3986"""
2987 url_parsed = urllib.parse.urlparse(url)
2988 return url_parsed._replace(
2989 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2990 path=escape_rfc3986(url_parsed.path),
2991 params=escape_rfc3986(url_parsed.params),
2992 query=escape_rfc3986(url_parsed.query),
2993 fragment=escape_rfc3986(url_parsed.fragment)
2994 ).geturl()
2995
2996
2997 def parse_qs(url):
2998 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
2999
3000
3001 def read_batch_urls(batch_fd):
3002 def fixup(url):
3003 if not isinstance(url, str):
3004 url = url.decode('utf-8', 'replace')
3005 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3006 for bom in BOM_UTF8:
3007 if url.startswith(bom):
3008 url = url[len(bom):]
3009 url = url.lstrip()
3010 if not url or url.startswith(('#', ';', ']')):
3011 return False
3012 # "#" cannot be stripped out since it is part of the URI
3013 # However, it can be safely stripped out if it follows whitespace
3014 return re.split(r'\s#', url, 1)[0].rstrip()
3015
3016 with contextlib.closing(batch_fd) as fd:
3017 return [url for url in map(fixup, fd) if url]
3018
3019
3020 def urlencode_postdata(*args, **kargs):
3021 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3022
3023
3024 def update_url_query(url, query):
3025 if not query:
3026 return url
3027 parsed_url = urllib.parse.urlparse(url)
3028 qs = urllib.parse.parse_qs(parsed_url.query)
3029 qs.update(query)
3030 return urllib.parse.urlunparse(parsed_url._replace(
3031 query=urllib.parse.urlencode(qs, True)))
3032
3033
3034 def update_Request(req, url=None, data=None, headers=None, query=None):
3035 req_headers = req.headers.copy()
3036 req_headers.update(headers or {})
3037 req_data = data or req.data
3038 req_url = update_url_query(url or req.get_full_url(), query)
3039 req_get_method = req.get_method()
3040 if req_get_method == 'HEAD':
3041 req_type = HEADRequest
3042 elif req_get_method == 'PUT':
3043 req_type = PUTRequest
3044 else:
3045 req_type = urllib.request.Request
3046 new_req = req_type(
3047 req_url, data=req_data, headers=req_headers,
3048 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3049 if hasattr(req, 'timeout'):
3050 new_req.timeout = req.timeout
3051 return new_req
3052
3053
3054 def _multipart_encode_impl(data, boundary):
3055 content_type = 'multipart/form-data; boundary=%s' % boundary
3056
3057 out = b''
3058 for k, v in data.items():
3059 out += b'--' + boundary.encode('ascii') + b'\r\n'
3060 if isinstance(k, str):
3061 k = k.encode()
3062 if isinstance(v, str):
3063 v = v.encode()
3064 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3065 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3066 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3067 if boundary.encode('ascii') in content:
3068 raise ValueError('Boundary overlaps with data')
3069 out += content
3070
3071 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3072
3073 return out, content_type
3074
3075
3076 def multipart_encode(data, boundary=None):
3077 '''
3078 Encode a dict to RFC 7578-compliant form-data
3079
3080 data:
3081 A dict where keys and values can be either Unicode or bytes-like
3082 objects.
3083 boundary:
3084 If specified, it must be a Unicode object and is used as the boundary.
3085 Otherwise a random boundary is generated.
3086
3087 Reference: https://tools.ietf.org/html/rfc7578
3088 '''
3089 has_specified_boundary = boundary is not None
3090
3091 while True:
3092 if boundary is None:
3093 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3094
3095 try:
3096 out, content_type = _multipart_encode_impl(data, boundary)
3097 break
3098 except ValueError:
3099 if has_specified_boundary:
3100 raise
3101 boundary = None
3102
3103 return out, content_type
3104
3105
3106 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3107 for val in map(d.get, variadic(key_or_keys)):
3108 if val is not None and (val or not skip_false_values):
3109 return val
3110 return default
3111
3112
3113 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3114 for f in funcs:
3115 try:
3116 val = f(*args, **kwargs)
3117 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3118 pass
3119 else:
3120 if expected_type is None or isinstance(val, expected_type):
3121 return val
3122
3123
3124 def try_get(src, getter, expected_type=None):
3125 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3126
3127
3128 def filter_dict(dct, cndn=lambda _, v: v is not None):
3129 return {k: v for k, v in dct.items() if cndn(k, v)}
3130
3131
3132 def merge_dicts(*dicts):
3133 merged = {}
3134 for a_dict in dicts:
3135 for k, v in a_dict.items():
3136 if (v is not None and k not in merged
3137 or isinstance(v, str) and merged[k] == ''):
3138 merged[k] = v
3139 return merged
3140
3141
3142 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3143 return string if isinstance(string, str) else str(string, encoding, errors)
3144
3145
3146 US_RATINGS = {
3147 'G': 0,
3148 'PG': 10,
3149 'PG-13': 13,
3150 'R': 16,
3151 'NC': 18,
3152 }
3153
3154
3155 TV_PARENTAL_GUIDELINES = {
3156 'TV-Y': 0,
3157 'TV-Y7': 7,
3158 'TV-G': 0,
3159 'TV-PG': 0,
3160 'TV-14': 14,
3161 'TV-MA': 17,
3162 }
3163
3164
3165 def parse_age_limit(s):
3166 # isinstance(False, int) is True. So type() must be used instead
3167 if type(s) is int: # noqa: E721
3168 return s if 0 <= s <= 21 else None
3169 elif not isinstance(s, str):
3170 return None
3171 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3172 if m:
3173 return int(m.group('age'))
3174 s = s.upper()
3175 if s in US_RATINGS:
3176 return US_RATINGS[s]
3177 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3178 if m:
3179 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3180 return None
3181
3182
3183 def strip_jsonp(code):
3184 return re.sub(
3185 r'''(?sx)^
3186 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3187 (?:\s*&&\s*(?P=func_name))?
3188 \s*\(\s*(?P<callback_data>.*)\);?
3189 \s*?(?://[^\n]*)*$''',
3190 r'\g<callback_data>', code)
3191
3192
3193 def js_to_json(code, vars={}):
3194 # vars is a dict of var, val pairs to substitute
3195 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3196 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3197 INTEGER_TABLE = (
3198 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3199 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3200 )
3201
3202 def fix_kv(m):
3203 v = m.group(0)
3204 if v in ('true', 'false', 'null'):
3205 return v
3206 elif v in ('undefined', 'void 0'):
3207 return 'null'
3208 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3209 return ""
3210
3211 if v[0] in ("'", '"'):
3212 v = re.sub(r'(?s)\\.|"', lambda m: {
3213 '"': '\\"',
3214 "\\'": "'",
3215 '\\\n': '',
3216 '\\x': '\\u00',
3217 }.get(m.group(0), m.group(0)), v[1:-1])
3218 else:
3219 for regex, base in INTEGER_TABLE:
3220 im = re.match(regex, v)
3221 if im:
3222 i = int(im.group(1), base)
3223 return '"%d":' % i if v.endswith(':') else '%d' % i
3224
3225 if v in vars:
3226 return vars[v]
3227
3228 return '"%s"' % v
3229
3230 def create_map(mobj):
3231 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3232
3233 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3234 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3235
3236 return re.sub(r'''(?sx)
3237 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3238 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3239 {comment}|,(?={skip}[\]}}])|
3240 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3241 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3242 [0-9]+(?={skip}:)|
3243 !+
3244 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
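
# Example usage (single quotes, trailing commas and `undefined` are normalised):
#   js_to_json("{'clip':{'provider':'pseudo'}}") -> '{"clip":{"provider":"pseudo"}}'
#   js_to_json('{"abc": "def",}')                -> '{"abc": "def"}'
#   js_to_json('{"x": undefined}')               -> '{"x": null}'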
3245
3246
3247 def qualities(quality_ids):
3248 """ Get a numeric quality value out of a list of possible values """
3249 def q(qid):
3250 try:
3251 return quality_ids.index(qid)
3252 except ValueError:
3253 return -1
3254 return q
3255
3256
3257 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3258
3259
3260 DEFAULT_OUTTMPL = {
3261 'default': '%(title)s [%(id)s].%(ext)s',
3262 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3263 }
3264 OUTTMPL_TYPES = {
3265 'chapter': None,
3266 'subtitle': None,
3267 'thumbnail': None,
3268 'description': 'description',
3269 'annotation': 'annotations.xml',
3270 'infojson': 'info.json',
3271 'link': None,
3272 'pl_video': None,
3273 'pl_thumbnail': None,
3274 'pl_description': 'description',
3275 'pl_infojson': 'info.json',
3276 }
3277
3278 # As of [1] format syntax is:
3279 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3280 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3281 STR_FORMAT_RE_TMPL = r'''(?x)
3282 (?<!%)(?P<prefix>(?:%%)*)
3283 %
3284 (?P<has_key>\((?P<key>{0})\))?
3285 (?P<format>
3286 (?P<conversion>[#0\-+ ]+)?
3287 (?P<min_width>\d+)?
3288 (?P<precision>\.\d+)?
3289 (?P<len_mod>[hlL])? # unused in python
3290 {1} # conversion type
3291 )
3292 '''
3293
3294
3295 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3296
3297
3298 def limit_length(s, length):
3299 """ Add ellipses to overly long strings """
3300 if s is None:
3301 return None
3302 ELLIPSES = '...'
3303 if len(s) > length:
3304 return s[:length - len(ELLIPSES)] + ELLIPSES
3305 return s
3306
3307
3308 def version_tuple(v):
3309 return tuple(int(e) for e in re.split(r'[-.]', v))
3310
3311
3312 def is_outdated_version(version, limit, assume_new=True):
3313 if not version:
3314 return not assume_new
3315 try:
3316 return version_tuple(version) < version_tuple(limit)
3317 except ValueError:
3318 return not assume_new
3319
3320
3321 def ytdl_is_updateable():
3322 """ Returns if yt-dlp can be updated with -U """
3323
3324 from .update import is_non_updateable
3325
3326 return not is_non_updateable()
3327
3328
3329 def args_to_str(args):
3330 # Get a short string representation for a subprocess command
3331 return ' '.join(compat_shlex_quote(a) for a in args)
3332
3333
3334 def error_to_compat_str(err):
3335 return str(err)
3336
3337
3338 def error_to_str(err):
3339 return f'{type(err).__name__}: {err}'
3340
3341
3342 def mimetype2ext(mt):
3343 if mt is None:
3344 return None
3345
3346 mt, _, params = mt.partition(';')
3347 mt = mt.strip()
3348
3349 FULL_MAP = {
3350 'audio/mp4': 'm4a',
3351 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
3352 # since it is the most popular one
3353 'audio/mpeg': 'mp3',
3354 'audio/x-wav': 'wav',
3355 'audio/wav': 'wav',
3356 'audio/wave': 'wav',
3357 }
3358
3359 ext = FULL_MAP.get(mt)
3360 if ext is not None:
3361 return ext
3362
3363 SUBTYPE_MAP = {
3364 '3gpp': '3gp',
3365 'smptett+xml': 'tt',
3366 'ttaf+xml': 'dfxp',
3367 'ttml+xml': 'ttml',
3368 'x-flv': 'flv',
3369 'x-mp4-fragmented': 'mp4',
3370 'x-ms-sami': 'sami',
3371 'x-ms-wmv': 'wmv',
3372 'mpegurl': 'm3u8',
3373 'x-mpegurl': 'm3u8',
3374 'vnd.apple.mpegurl': 'm3u8',
3375 'dash+xml': 'mpd',
3376 'f4m+xml': 'f4m',
3377 'hds+xml': 'f4m',
3378 'vnd.ms-sstr+xml': 'ism',
3379 'quicktime': 'mov',
3380 'mp2t': 'ts',
3381 'x-wav': 'wav',
3382 'filmstrip+json': 'fs',
3383 'svg+xml': 'svg',
3384 }
3385
3386 _, _, subtype = mt.rpartition('/')
3387 ext = SUBTYPE_MAP.get(subtype.lower())
3388 if ext is not None:
3389 return ext
3390
3391 SUFFIX_MAP = {
3392 'json': 'json',
3393 'xml': 'xml',
3394 'zip': 'zip',
3395 'gzip': 'gz',
3396 }
3397
3398 _, _, suffix = subtype.partition('+')
3399 ext = SUFFIX_MAP.get(suffix)
3400 if ext is not None:
3401 return ext
3402
3403 return subtype.replace('+', '.')
3404
3405
3406 def ext2mimetype(ext_or_url):
3407 if not ext_or_url:
3408 return None
3409 if '.' not in ext_or_url:
3410 ext_or_url = f'file.{ext_or_url}'
3411 return mimetypes.guess_type(ext_or_url)[0]
3412
3413
3414 def parse_codecs(codecs_str):
3415 # http://tools.ietf.org/html/rfc6381
3416 if not codecs_str:
3417 return {}
3418 split_codecs = list(filter(None, map(
3419 str.strip, codecs_str.strip().strip(',').split(','))))
3420 vcodec, acodec, scodec, hdr = None, None, None, None
3421 for full_codec in split_codecs:
3422 parts = full_codec.split('.')
3423 codec = parts[0].replace('0', '')
3424 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3425 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3426 if not vcodec:
3427 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3428 if codec in ('dvh1', 'dvhe'):
3429 hdr = 'DV'
3430 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3431 hdr = 'HDR10'
3432 elif full_codec.replace('0', '').startswith('vp9.2'):
3433 hdr = 'HDR10'
3434 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3435 if not acodec:
3436 acodec = full_codec
3437 elif codec in ('stpp', 'wvtt',):
3438 if not scodec:
3439 scodec = full_codec
3440 else:
3441 write_string(f'WARNING: Unknown codec {full_codec}\n')
3442 if vcodec or acodec or scodec:
3443 return {
3444 'vcodec': vcodec or 'none',
3445 'acodec': acodec or 'none',
3446 'dynamic_range': hdr,
3447 **({'scodec': scodec} if scodec is not None else {}),
3448 }
3449 elif len(split_codecs) == 2:
3450 return {
3451 'vcodec': split_codecs[0],
3452 'acodec': split_codecs[1],
3453 }
3454 return {}
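
# Example usage (Dolby Vision and 10-bit AV1/VP9 codecs set dynamic_range):
#   parse_codecs('avc1.77.30, mp4a.40.2')
#       -> {'vcodec': 'avc1.77.30', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
#   parse_codecs('dvhe.05.06')
#       -> {'vcodec': 'dvhe.05.06', 'acodec': 'none', 'dynamic_range': 'DV'}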
3455
3456
3457 def urlhandle_detect_ext(url_handle):
3458 getheader = url_handle.headers.get
3459
3460 cd = getheader('Content-Disposition')
3461 if cd:
3462 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3463 if m:
3464 e = determine_ext(m.group('filename'), default_ext=None)
3465 if e:
3466 return e
3467
3468 return mimetype2ext(getheader('Content-Type'))
3469
3470
3471 def encode_data_uri(data, mime_type):
3472 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3473
3474
3475 def age_restricted(content_limit, age_limit):
3476 """ Returns True iff the content should be blocked """
3477
3478 if age_limit is None: # No limit set
3479 return False
3480 if content_limit is None:
3481 return False # Content available for everyone
3482 return age_limit < content_limit
3483
3484
3485 def is_html(first_bytes):
3486 """ Detect whether a file contains HTML by examining its first bytes. """
3487
3488 BOMS = [
3489 (b'\xef\xbb\xbf', 'utf-8'),
3490 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3491 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3492 (b'\xff\xfe', 'utf-16-le'),
3493 (b'\xfe\xff', 'utf-16-be'),
3494 ]
3495
3496 encoding = 'utf-8'
3497 for bom, enc in BOMS:
3498 while first_bytes.startswith(bom):
3499 encoding, first_bytes = enc, first_bytes[len(bom):]
3500
3501 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3502
3503
3504 def determine_protocol(info_dict):
3505 protocol = info_dict.get('protocol')
3506 if protocol is not None:
3507 return protocol
3508
3509 url = sanitize_url(info_dict['url'])
3510 if url.startswith('rtmp'):
3511 return 'rtmp'
3512 elif url.startswith('mms'):
3513 return 'mms'
3514 elif url.startswith('rtsp'):
3515 return 'rtsp'
3516
3517 ext = determine_ext(url)
3518 if ext == 'm3u8':
3519 return 'm3u8'
3520 elif ext == 'f4m':
3521 return 'f4m'
3522
3523 return urllib.parse.urlparse(url).scheme
3524
3525
3526 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3527 """ Render a list of rows, each as a list of values.
3528 Text after a \t will be right aligned """
3529 def width(string):
3530 return len(remove_terminal_sequences(string).replace('\t', ''))
3531
3532 def get_max_lens(table):
3533 return [max(width(str(v)) for v in col) for col in zip(*table)]
3534
3535 def filter_using_list(row, filterArray):
3536 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3537
3538 max_lens = get_max_lens(data) if hide_empty else []
3539 header_row = filter_using_list(header_row, max_lens)
3540 data = [filter_using_list(row, max_lens) for row in data]
3541
3542 table = [header_row] + data
3543 max_lens = get_max_lens(table)
3544 extra_gap += 1
3545 if delim:
3546 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3547 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3548 for row in table:
3549 for pos, text in enumerate(map(str, row)):
3550 if '\t' in text:
3551 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3552 else:
3553 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3554 ret = '\n'.join(''.join(row).rstrip() for row in table)
3555 return ret
3556
3557
3558 def _match_one(filter_part, dct, incomplete):
3559 # TODO: Generalize code with YoutubeDL._build_format_filter
3560 STRING_OPERATORS = {
3561 '*=': operator.contains,
3562 '^=': lambda attr, value: attr.startswith(value),
3563 '$=': lambda attr, value: attr.endswith(value),
3564 '~=': lambda attr, value: re.search(value, attr),
3565 }
3566 COMPARISON_OPERATORS = {
3567 **STRING_OPERATORS,
3568 '<=': operator.le, # "<=" must be defined above "<"
3569 '<': operator.lt,
3570 '>=': operator.ge,
3571 '>': operator.gt,
3572 '=': operator.eq,
3573 }
3574
3575 if isinstance(incomplete, bool):
3576 is_incomplete = lambda _: incomplete
3577 else:
3578 is_incomplete = lambda k: k in incomplete
3579
3580 operator_rex = re.compile(r'''(?x)
3581 (?P<key>[a-z_]+)
3582 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3583 (?:
3584 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3585 (?P<strval>.+?)
3586 )
3587 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3588 m = operator_rex.fullmatch(filter_part.strip())
3589 if m:
3590 m = m.groupdict()
3591 unnegated_op = COMPARISON_OPERATORS[m['op']]
3592 if m['negation']:
3593 op = lambda attr, value: not unnegated_op(attr, value)
3594 else:
3595 op = unnegated_op
3596 comparison_value = m['quotedstrval'] or m['strval']  # the pattern defines no 'intval' group; one of these two always matches
3597 if m['quote']:
3598 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3599 actual_value = dct.get(m['key'])
3600 numeric_comparison = None
3601 if isinstance(actual_value, (int, float)):
3602 # If the original field is a string and the comparison value is a
3603 # number, we should respect the origin of the original field
3604 # and process the comparison value as a string (see
3605 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3606 try:
3607 numeric_comparison = int(comparison_value)
3608 except ValueError:
3609 numeric_comparison = parse_filesize(comparison_value)
3610 if numeric_comparison is None:
3611 numeric_comparison = parse_filesize(f'{comparison_value}B')
3612 if numeric_comparison is None:
3613 numeric_comparison = parse_duration(comparison_value)
3614 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3615 raise ValueError('Operator %s only supports string values!' % m['op'])
3616 if actual_value is None:
3617 return is_incomplete(m['key']) or m['none_inclusive']
3618 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3619
3620 UNARY_OPERATORS = {
3621 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3622 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3623 }
3624 operator_rex = re.compile(r'''(?x)
3625 (?P<op>%s)\s*(?P<key>[a-z_]+)
3626 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3627 m = operator_rex.fullmatch(filter_part.strip())
3628 if m:
3629 op = UNARY_OPERATORS[m.group('op')]
3630 actual_value = dct.get(m.group('key'))
3631 if is_incomplete(m.group('key')) and actual_value is None:
3632 return True
3633 return op(actual_value)
3634
3635 raise ValueError('Invalid filter part %r' % filter_part)
3636
3637
3638 def match_str(filter_str, dct, incomplete=False):
3639 """ Filter a dictionary with a simple string syntax.
3640 @returns Whether the filter passes
3641 @param incomplete Set of keys that are expected to be missing from dct.
3642 Can be True/False to indicate all/none of the keys may be missing.
3643 All conditions on incomplete keys pass if the key is missing
3644 """
3645 return all(
3646 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3647 for filter_part in re.split(r'(?<!\\)&', filter_str))
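# Hedged examples of the filter syntax (editor's addition):
#   match_str('duration > 600 & like_count >? 100', {'duration': 700})     -> True
#   match_str('!is_live & title *= news', {'is_live': True, 'title': 'x'}) -> False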
3648
3649
3650 def match_filter_func(filters):
3651 if not filters:
3652 return None
3653 filters = set(variadic(filters))
3654
3655 interactive = '-' in filters
3656 if interactive:
3657 filters.remove('-')
3658
3659 def _match_func(info_dict, incomplete=False):
3660 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3661 return NO_DEFAULT if interactive and not incomplete else None
3662 else:
3663 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3664 filter_str = ') | ('.join(map(str.strip, filters))
3665 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3666 return _match_func
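# e.g. match_filter_func('duration > 60') returns a callable suitable for the
# "match_filter" option: it yields None to accept an entry and a skip message
# otherwise (editor's note).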
3667
3668
3669 class download_range_func:
3670 def __init__(self, chapters, ranges):
3671 self.chapters, self.ranges = chapters, ranges
3672
3673 def __call__(self, info_dict, ydl):
3674 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3675 else 'Cannot match chapters since chapter information is unavailable')
3676 for regex in self.chapters or []:
3677 for i, chapter in enumerate(info_dict.get('chapters') or []):
3678 if re.search(regex, chapter['title']):
3679 warning = None
3680 yield {**chapter, 'index': i}
3681 if self.chapters and warning:
3682 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3683
3684 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3685
3686 def __eq__(self, other):
3687 return (isinstance(other, download_range_func)
3688 and self.chapters == other.chapters and self.ranges == other.ranges)
3689
3690
3691 def parse_dfxp_time_expr(time_expr):
3692 if not time_expr:
3693 return
3694
3695 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3696 if mobj:
3697 return float(mobj.group('time_offset'))
3698
3699 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3700 if mobj:
3701 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
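# Editor's examples: parse_dfxp_time_expr('1.5s') -> 1.5 and
# parse_dfxp_time_expr('00:01:02.500') -> 62.5; an empty or unparsable
# expression returns None.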
3702
3703
3704 def srt_subtitles_timecode(seconds):
3705 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3706
3707
3708 def ass_subtitles_timecode(seconds):
3709 time = timetuple_from_msec(seconds * 1000)
3710 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
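# e.g. srt_subtitles_timecode(3661.5) -> '01:01:01,500' and
# ass_subtitles_timecode(3661.5) -> '1:01:01.50' (editor's examples)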
3711
3712
3713 def dfxp2srt(dfxp_data):
3714 '''
3715 @param dfxp_data A bytes-like object containing DFXP data
3716 @returns A unicode object containing converted SRT data
3717 '''
3718 LEGACY_NAMESPACES = (
3719 (b'http://www.w3.org/ns/ttml', [
3720 b'http://www.w3.org/2004/11/ttaf1',
3721 b'http://www.w3.org/2006/04/ttaf1',
3722 b'http://www.w3.org/2006/10/ttaf1',
3723 ]),
3724 (b'http://www.w3.org/ns/ttml#styling', [
3725 b'http://www.w3.org/ns/ttml#style',
3726 ]),
3727 )
3728
3729 SUPPORTED_STYLING = [
3730 'color',
3731 'fontFamily',
3732 'fontSize',
3733 'fontStyle',
3734 'fontWeight',
3735 'textDecoration'
3736 ]
3737
3738 _x = functools.partial(xpath_with_ns, ns_map={
3739 'xml': 'http://www.w3.org/XML/1998/namespace',
3740 'ttml': 'http://www.w3.org/ns/ttml',
3741 'tts': 'http://www.w3.org/ns/ttml#styling',
3742 })
3743
3744 styles = {}
3745 default_style = {}
3746
3747 class TTMLPElementParser:
3748 def __init__(self):  # per-instance state: class-level mutable defaults would be shared between parser instances
3749 self._out = ''
3750 self._unclosed_elements = []
3751 self._applied_styles = []
3751
3752 def start(self, tag, attrib):
3753 if tag in (_x('ttml:br'), 'br'):
3754 self._out += '\n'
3755 else:
3756 unclosed_elements = []
3757 style = {}
3758 element_style_id = attrib.get('style')
3759 if default_style:
3760 style.update(default_style)
3761 if element_style_id:
3762 style.update(styles.get(element_style_id, {}))
3763 for prop in SUPPORTED_STYLING:
3764 prop_val = attrib.get(_x('tts:' + prop))
3765 if prop_val:
3766 style[prop] = prop_val
3767 if style:
3768 font = ''
3769 for k, v in sorted(style.items()):
3770 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3771 continue
3772 if k == 'color':
3773 font += ' color="%s"' % v
3774 elif k == 'fontSize':
3775 font += ' size="%s"' % v
3776 elif k == 'fontFamily':
3777 font += ' face="%s"' % v
3778 elif k == 'fontWeight' and v == 'bold':
3779 self._out += '<b>'
3780 unclosed_elements.append('b')
3781 elif k == 'fontStyle' and v == 'italic':
3782 self._out += '<i>'
3783 unclosed_elements.append('i')
3784 elif k == 'textDecoration' and v == 'underline':
3785 self._out += '<u>'
3786 unclosed_elements.append('u')
3787 if font:
3788 self._out += '<font' + font + '>'
3789 unclosed_elements.append('font')
3790 applied_style = {}
3791 if self._applied_styles:
3792 applied_style.update(self._applied_styles[-1])
3793 applied_style.update(style)
3794 self._applied_styles.append(applied_style)
3795 self._unclosed_elements.append(unclosed_elements)
3796
3797 def end(self, tag):
3798 if tag not in (_x('ttml:br'), 'br'):
3799 unclosed_elements = self._unclosed_elements.pop()
3800 for element in reversed(unclosed_elements):
3801 self._out += '</%s>' % element
3802 if unclosed_elements and self._applied_styles:
3803 self._applied_styles.pop()
3804
3805 def data(self, data):
3806 self._out += data
3807
3808 def close(self):
3809 return self._out.strip()
3810
3811 def parse_node(node):
3812 target = TTMLPElementParser()
3813 parser = xml.etree.ElementTree.XMLParser(target=target)
3814 parser.feed(xml.etree.ElementTree.tostring(node))
3815 return parser.close()
3816
3817 for k, v in LEGACY_NAMESPACES:
3818 for ns in v:
3819 dfxp_data = dfxp_data.replace(ns, k)
3820
3821 dfxp = compat_etree_fromstring(dfxp_data)
3822 out = []
3823 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3824
3825 if not paras:
3826 raise ValueError('Invalid dfxp/TTML subtitle')
3827
3828 repeat = False
3829 while True:
3830 for style in dfxp.findall(_x('.//ttml:style')):
3831 style_id = style.get('id') or style.get(_x('xml:id'))
3832 if not style_id:
3833 continue
3834 parent_style_id = style.get('style')
3835 if parent_style_id:
3836 if parent_style_id not in styles:
3837 repeat = True
3838 continue
3839 styles[style_id] = styles[parent_style_id].copy()
3840 for prop in SUPPORTED_STYLING:
3841 prop_val = style.get(_x('tts:' + prop))
3842 if prop_val:
3843 styles.setdefault(style_id, {})[prop] = prop_val
3844 if repeat:
3845 repeat = False
3846 else:
3847 break
3848
3849 for p in ('body', 'div'):
3850 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3851 if ele is None:
3852 continue
3853 style = styles.get(ele.get('style'))
3854 if not style:
3855 continue
3856 default_style.update(style)
3857
3858 for para, index in zip(paras, itertools.count(1)):
3859 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3860 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3861 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3862 if begin_time is None:
3863 continue
3864 if not end_time:
3865 if not dur:
3866 continue
3867 end_time = begin_time + dur
3868 out.append('%d\n%s --> %s\n%s\n\n' % (
3869 index,
3870 srt_subtitles_timecode(begin_time),
3871 srt_subtitles_timecode(end_time),
3872 parse_node(para)))
3873
3874 return ''.join(out)
3875
3876
3877 def cli_option(params, command_option, param, separator=None):
3878 param = params.get(param)
3879 return ([] if param is None
3880 else [command_option, str(param)] if separator is None
3881 else [f'{command_option}{separator}{param}'])
3882
3883
3884 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3885 param = params.get(param)
3886 assert param in (True, False, None)
3887 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3888
3889
3890 def cli_valueless_option(params, command_option, param, expected_value=True):
3891 return [command_option] if params.get(param) == expected_value else []
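# Hedged examples of how these helpers expand params into CLI arguments:
#   cli_option({'proxy': 'socks5://127.0.0.1'}, '--proxy', 'proxy')
#       -> ['--proxy', 'socks5://127.0.0.1']
#   cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#       -> ['--no-check-certificate', 'true']
#   cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#       -> ['--quiet']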
3892
3893
3894 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3895 if isinstance(argdict, (list, tuple)): # for backward compatibility
3896 if use_compat:
3897 return argdict
3898 else:
3899 argdict = None
3900 if argdict is None:
3901 return default
3902 assert isinstance(argdict, dict)
3903
3904 assert isinstance(keys, (list, tuple))
3905 for key_list in keys:
3906 arg_list = list(filter(
3907 lambda x: x is not None,
3908 [argdict.get(key.lower()) for key in variadic(key_list)]))
3909 if arg_list:
3910 return [arg for args in arg_list for arg in args]
3911 return default
3912
3913
3914 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3915 main_key, exe = main_key.lower(), exe.lower()
3916 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3917 keys = [f'{root_key}{k}' for k in (keys or [''])]
3918 if root_key in keys:
3919 if main_key != exe:
3920 keys.append((main_key, exe))
3921 keys.append('default')
3922 else:
3923 use_compat = False
3924 return cli_configuration_args(argdict, keys, default, use_compat)
3925
3926
3927 class ISO639Utils:
3928 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3929 _lang_map = {
3930 'aa': 'aar',
3931 'ab': 'abk',
3932 'ae': 'ave',
3933 'af': 'afr',
3934 'ak': 'aka',
3935 'am': 'amh',
3936 'an': 'arg',
3937 'ar': 'ara',
3938 'as': 'asm',
3939 'av': 'ava',
3940 'ay': 'aym',
3941 'az': 'aze',
3942 'ba': 'bak',
3943 'be': 'bel',
3944 'bg': 'bul',
3945 'bh': 'bih',
3946 'bi': 'bis',
3947 'bm': 'bam',
3948 'bn': 'ben',
3949 'bo': 'bod',
3950 'br': 'bre',
3951 'bs': 'bos',
3952 'ca': 'cat',
3953 'ce': 'che',
3954 'ch': 'cha',
3955 'co': 'cos',
3956 'cr': 'cre',
3957 'cs': 'ces',
3958 'cu': 'chu',
3959 'cv': 'chv',
3960 'cy': 'cym',
3961 'da': 'dan',
3962 'de': 'deu',
3963 'dv': 'div',
3964 'dz': 'dzo',
3965 'ee': 'ewe',
3966 'el': 'ell',
3967 'en': 'eng',
3968 'eo': 'epo',
3969 'es': 'spa',
3970 'et': 'est',
3971 'eu': 'eus',
3972 'fa': 'fas',
3973 'ff': 'ful',
3974 'fi': 'fin',
3975 'fj': 'fij',
3976 'fo': 'fao',
3977 'fr': 'fra',
3978 'fy': 'fry',
3979 'ga': 'gle',
3980 'gd': 'gla',
3981 'gl': 'glg',
3982 'gn': 'grn',
3983 'gu': 'guj',
3984 'gv': 'glv',
3985 'ha': 'hau',
3986 'he': 'heb',
3987 'iw': 'heb', # Replaced by he in 1989 revision
3988 'hi': 'hin',
3989 'ho': 'hmo',
3990 'hr': 'hrv',
3991 'ht': 'hat',
3992 'hu': 'hun',
3993 'hy': 'hye',
3994 'hz': 'her',
3995 'ia': 'ina',
3996 'id': 'ind',
3997 'in': 'ind', # Replaced by id in 1989 revision
3998 'ie': 'ile',
3999 'ig': 'ibo',
4000 'ii': 'iii',
4001 'ik': 'ipk',
4002 'io': 'ido',
4003 'is': 'isl',
4004 'it': 'ita',
4005 'iu': 'iku',
4006 'ja': 'jpn',
4007 'jv': 'jav',
4008 'ka': 'kat',
4009 'kg': 'kon',
4010 'ki': 'kik',
4011 'kj': 'kua',
4012 'kk': 'kaz',
4013 'kl': 'kal',
4014 'km': 'khm',
4015 'kn': 'kan',
4016 'ko': 'kor',
4017 'kr': 'kau',
4018 'ks': 'kas',
4019 'ku': 'kur',
4020 'kv': 'kom',
4021 'kw': 'cor',
4022 'ky': 'kir',
4023 'la': 'lat',
4024 'lb': 'ltz',
4025 'lg': 'lug',
4026 'li': 'lim',
4027 'ln': 'lin',
4028 'lo': 'lao',
4029 'lt': 'lit',
4030 'lu': 'lub',
4031 'lv': 'lav',
4032 'mg': 'mlg',
4033 'mh': 'mah',
4034 'mi': 'mri',
4035 'mk': 'mkd',
4036 'ml': 'mal',
4037 'mn': 'mon',
4038 'mr': 'mar',
4039 'ms': 'msa',
4040 'mt': 'mlt',
4041 'my': 'mya',
4042 'na': 'nau',
4043 'nb': 'nob',
4044 'nd': 'nde',
4045 'ne': 'nep',
4046 'ng': 'ndo',
4047 'nl': 'nld',
4048 'nn': 'nno',
4049 'no': 'nor',
4050 'nr': 'nbl',
4051 'nv': 'nav',
4052 'ny': 'nya',
4053 'oc': 'oci',
4054 'oj': 'oji',
4055 'om': 'orm',
4056 'or': 'ori',
4057 'os': 'oss',
4058 'pa': 'pan',
4059 'pi': 'pli',
4060 'pl': 'pol',
4061 'ps': 'pus',
4062 'pt': 'por',
4063 'qu': 'que',
4064 'rm': 'roh',
4065 'rn': 'run',
4066 'ro': 'ron',
4067 'ru': 'rus',
4068 'rw': 'kin',
4069 'sa': 'san',
4070 'sc': 'srd',
4071 'sd': 'snd',
4072 'se': 'sme',
4073 'sg': 'sag',
4074 'si': 'sin',
4075 'sk': 'slk',
4076 'sl': 'slv',
4077 'sm': 'smo',
4078 'sn': 'sna',
4079 'so': 'som',
4080 'sq': 'sqi',
4081 'sr': 'srp',
4082 'ss': 'ssw',
4083 'st': 'sot',
4084 'su': 'sun',
4085 'sv': 'swe',
4086 'sw': 'swa',
4087 'ta': 'tam',
4088 'te': 'tel',
4089 'tg': 'tgk',
4090 'th': 'tha',
4091 'ti': 'tir',
4092 'tk': 'tuk',
4093 'tl': 'tgl',
4094 'tn': 'tsn',
4095 'to': 'ton',
4096 'tr': 'tur',
4097 'ts': 'tso',
4098 'tt': 'tat',
4099 'tw': 'twi',
4100 'ty': 'tah',
4101 'ug': 'uig',
4102 'uk': 'ukr',
4103 'ur': 'urd',
4104 'uz': 'uzb',
4105 've': 'ven',
4106 'vi': 'vie',
4107 'vo': 'vol',
4108 'wa': 'wln',
4109 'wo': 'wol',
4110 'xh': 'xho',
4111 'yi': 'yid',
4112 'ji': 'yid', # Replaced by yi in 1989 revision
4113 'yo': 'yor',
4114 'za': 'zha',
4115 'zh': 'zho',
4116 'zu': 'zul',
4117 }
4118
4119 @classmethod
4120 def short2long(cls, code):
4121 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4122 return cls._lang_map.get(code[:2])
4123
4124 @classmethod
4125 def long2short(cls, code):
4126 """Convert language code from ISO 639-2/T to ISO 639-1"""
4127 for short_name, long_name in cls._lang_map.items():
4128 if long_name == code:
4129 return short_name
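# e.g. ISO639Utils.short2long('en') -> 'eng' and ISO639Utils.long2short('eng')
# -> 'en'; short2long() only inspects the first two characters (editor's note).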
4130
4131
4132 class ISO3166Utils:
4133 # From http://data.okfn.org/data/core/country-list
4134 _country_map = {
4135 'AF': 'Afghanistan',
4136 'AX': 'Åland Islands',
4137 'AL': 'Albania',
4138 'DZ': 'Algeria',
4139 'AS': 'American Samoa',
4140 'AD': 'Andorra',
4141 'AO': 'Angola',
4142 'AI': 'Anguilla',
4143 'AQ': 'Antarctica',
4144 'AG': 'Antigua and Barbuda',
4145 'AR': 'Argentina',
4146 'AM': 'Armenia',
4147 'AW': 'Aruba',
4148 'AU': 'Australia',
4149 'AT': 'Austria',
4150 'AZ': 'Azerbaijan',
4151 'BS': 'Bahamas',
4152 'BH': 'Bahrain',
4153 'BD': 'Bangladesh',
4154 'BB': 'Barbados',
4155 'BY': 'Belarus',
4156 'BE': 'Belgium',
4157 'BZ': 'Belize',
4158 'BJ': 'Benin',
4159 'BM': 'Bermuda',
4160 'BT': 'Bhutan',
4161 'BO': 'Bolivia, Plurinational State of',
4162 'BQ': 'Bonaire, Sint Eustatius and Saba',
4163 'BA': 'Bosnia and Herzegovina',
4164 'BW': 'Botswana',
4165 'BV': 'Bouvet Island',
4166 'BR': 'Brazil',
4167 'IO': 'British Indian Ocean Territory',
4168 'BN': 'Brunei Darussalam',
4169 'BG': 'Bulgaria',
4170 'BF': 'Burkina Faso',
4171 'BI': 'Burundi',
4172 'KH': 'Cambodia',
4173 'CM': 'Cameroon',
4174 'CA': 'Canada',
4175 'CV': 'Cape Verde',
4176 'KY': 'Cayman Islands',
4177 'CF': 'Central African Republic',
4178 'TD': 'Chad',
4179 'CL': 'Chile',
4180 'CN': 'China',
4181 'CX': 'Christmas Island',
4182 'CC': 'Cocos (Keeling) Islands',
4183 'CO': 'Colombia',
4184 'KM': 'Comoros',
4185 'CG': 'Congo',
4186 'CD': 'Congo, the Democratic Republic of the',
4187 'CK': 'Cook Islands',
4188 'CR': 'Costa Rica',
4189 'CI': 'Côte d\'Ivoire',
4190 'HR': 'Croatia',
4191 'CU': 'Cuba',
4192 'CW': 'Curaçao',
4193 'CY': 'Cyprus',
4194 'CZ': 'Czech Republic',
4195 'DK': 'Denmark',
4196 'DJ': 'Djibouti',
4197 'DM': 'Dominica',
4198 'DO': 'Dominican Republic',
4199 'EC': 'Ecuador',
4200 'EG': 'Egypt',
4201 'SV': 'El Salvador',
4202 'GQ': 'Equatorial Guinea',
4203 'ER': 'Eritrea',
4204 'EE': 'Estonia',
4205 'ET': 'Ethiopia',
4206 'FK': 'Falkland Islands (Malvinas)',
4207 'FO': 'Faroe Islands',
4208 'FJ': 'Fiji',
4209 'FI': 'Finland',
4210 'FR': 'France',
4211 'GF': 'French Guiana',
4212 'PF': 'French Polynesia',
4213 'TF': 'French Southern Territories',
4214 'GA': 'Gabon',
4215 'GM': 'Gambia',
4216 'GE': 'Georgia',
4217 'DE': 'Germany',
4218 'GH': 'Ghana',
4219 'GI': 'Gibraltar',
4220 'GR': 'Greece',
4221 'GL': 'Greenland',
4222 'GD': 'Grenada',
4223 'GP': 'Guadeloupe',
4224 'GU': 'Guam',
4225 'GT': 'Guatemala',
4226 'GG': 'Guernsey',
4227 'GN': 'Guinea',
4228 'GW': 'Guinea-Bissau',
4229 'GY': 'Guyana',
4230 'HT': 'Haiti',
4231 'HM': 'Heard Island and McDonald Islands',
4232 'VA': 'Holy See (Vatican City State)',
4233 'HN': 'Honduras',
4234 'HK': 'Hong Kong',
4235 'HU': 'Hungary',
4236 'IS': 'Iceland',
4237 'IN': 'India',
4238 'ID': 'Indonesia',
4239 'IR': 'Iran, Islamic Republic of',
4240 'IQ': 'Iraq',
4241 'IE': 'Ireland',
4242 'IM': 'Isle of Man',
4243 'IL': 'Israel',
4244 'IT': 'Italy',
4245 'JM': 'Jamaica',
4246 'JP': 'Japan',
4247 'JE': 'Jersey',
4248 'JO': 'Jordan',
4249 'KZ': 'Kazakhstan',
4250 'KE': 'Kenya',
4251 'KI': 'Kiribati',
4252 'KP': 'Korea, Democratic People\'s Republic of',
4253 'KR': 'Korea, Republic of',
4254 'KW': 'Kuwait',
4255 'KG': 'Kyrgyzstan',
4256 'LA': 'Lao People\'s Democratic Republic',
4257 'LV': 'Latvia',
4258 'LB': 'Lebanon',
4259 'LS': 'Lesotho',
4260 'LR': 'Liberia',
4261 'LY': 'Libya',
4262 'LI': 'Liechtenstein',
4263 'LT': 'Lithuania',
4264 'LU': 'Luxembourg',
4265 'MO': 'Macao',
4266 'MK': 'Macedonia, the Former Yugoslav Republic of',
4267 'MG': 'Madagascar',
4268 'MW': 'Malawi',
4269 'MY': 'Malaysia',
4270 'MV': 'Maldives',
4271 'ML': 'Mali',
4272 'MT': 'Malta',
4273 'MH': 'Marshall Islands',
4274 'MQ': 'Martinique',
4275 'MR': 'Mauritania',
4276 'MU': 'Mauritius',
4277 'YT': 'Mayotte',
4278 'MX': 'Mexico',
4279 'FM': 'Micronesia, Federated States of',
4280 'MD': 'Moldova, Republic of',
4281 'MC': 'Monaco',
4282 'MN': 'Mongolia',
4283 'ME': 'Montenegro',
4284 'MS': 'Montserrat',
4285 'MA': 'Morocco',
4286 'MZ': 'Mozambique',
4287 'MM': 'Myanmar',
4288 'NA': 'Namibia',
4289 'NR': 'Nauru',
4290 'NP': 'Nepal',
4291 'NL': 'Netherlands',
4292 'NC': 'New Caledonia',
4293 'NZ': 'New Zealand',
4294 'NI': 'Nicaragua',
4295 'NE': 'Niger',
4296 'NG': 'Nigeria',
4297 'NU': 'Niue',
4298 'NF': 'Norfolk Island',
4299 'MP': 'Northern Mariana Islands',
4300 'NO': 'Norway',
4301 'OM': 'Oman',
4302 'PK': 'Pakistan',
4303 'PW': 'Palau',
4304 'PS': 'Palestine, State of',
4305 'PA': 'Panama',
4306 'PG': 'Papua New Guinea',
4307 'PY': 'Paraguay',
4308 'PE': 'Peru',
4309 'PH': 'Philippines',
4310 'PN': 'Pitcairn',
4311 'PL': 'Poland',
4312 'PT': 'Portugal',
4313 'PR': 'Puerto Rico',
4314 'QA': 'Qatar',
4315 'RE': 'Réunion',
4316 'RO': 'Romania',
4317 'RU': 'Russian Federation',
4318 'RW': 'Rwanda',
4319 'BL': 'Saint Barthélemy',
4320 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4321 'KN': 'Saint Kitts and Nevis',
4322 'LC': 'Saint Lucia',
4323 'MF': 'Saint Martin (French part)',
4324 'PM': 'Saint Pierre and Miquelon',
4325 'VC': 'Saint Vincent and the Grenadines',
4326 'WS': 'Samoa',
4327 'SM': 'San Marino',
4328 'ST': 'Sao Tome and Principe',
4329 'SA': 'Saudi Arabia',
4330 'SN': 'Senegal',
4331 'RS': 'Serbia',
4332 'SC': 'Seychelles',
4333 'SL': 'Sierra Leone',
4334 'SG': 'Singapore',
4335 'SX': 'Sint Maarten (Dutch part)',
4336 'SK': 'Slovakia',
4337 'SI': 'Slovenia',
4338 'SB': 'Solomon Islands',
4339 'SO': 'Somalia',
4340 'ZA': 'South Africa',
4341 'GS': 'South Georgia and the South Sandwich Islands',
4342 'SS': 'South Sudan',
4343 'ES': 'Spain',
4344 'LK': 'Sri Lanka',
4345 'SD': 'Sudan',
4346 'SR': 'Suriname',
4347 'SJ': 'Svalbard and Jan Mayen',
4348 'SZ': 'Swaziland',
4349 'SE': 'Sweden',
4350 'CH': 'Switzerland',
4351 'SY': 'Syrian Arab Republic',
4352 'TW': 'Taiwan, Province of China',
4353 'TJ': 'Tajikistan',
4354 'TZ': 'Tanzania, United Republic of',
4355 'TH': 'Thailand',
4356 'TL': 'Timor-Leste',
4357 'TG': 'Togo',
4358 'TK': 'Tokelau',
4359 'TO': 'Tonga',
4360 'TT': 'Trinidad and Tobago',
4361 'TN': 'Tunisia',
4362 'TR': 'Turkey',
4363 'TM': 'Turkmenistan',
4364 'TC': 'Turks and Caicos Islands',
4365 'TV': 'Tuvalu',
4366 'UG': 'Uganda',
4367 'UA': 'Ukraine',
4368 'AE': 'United Arab Emirates',
4369 'GB': 'United Kingdom',
4370 'US': 'United States',
4371 'UM': 'United States Minor Outlying Islands',
4372 'UY': 'Uruguay',
4373 'UZ': 'Uzbekistan',
4374 'VU': 'Vanuatu',
4375 'VE': 'Venezuela, Bolivarian Republic of',
4376 'VN': 'Viet Nam',
4377 'VG': 'Virgin Islands, British',
4378 'VI': 'Virgin Islands, U.S.',
4379 'WF': 'Wallis and Futuna',
4380 'EH': 'Western Sahara',
4381 'YE': 'Yemen',
4382 'ZM': 'Zambia',
4383 'ZW': 'Zimbabwe',
4384 # Not ISO 3166 codes, but used for IP blocks
4385 'AP': 'Asia/Pacific Region',
4386 'EU': 'Europe',
4387 }
4388
4389 @classmethod
4390 def short2full(cls, code):
4391 """Convert an ISO 3166-2 country code to the corresponding full name"""
4392 return cls._country_map.get(code.upper())
4393
4394
4395 class GeoUtils:
4396 # Major IPv4 address blocks per country
4397 _country_ip_map = {
4398 'AD': '46.172.224.0/19',
4399 'AE': '94.200.0.0/13',
4400 'AF': '149.54.0.0/17',
4401 'AG': '209.59.64.0/18',
4402 'AI': '204.14.248.0/21',
4403 'AL': '46.99.0.0/16',
4404 'AM': '46.70.0.0/15',
4405 'AO': '105.168.0.0/13',
4406 'AP': '182.50.184.0/21',
4407 'AQ': '23.154.160.0/24',
4408 'AR': '181.0.0.0/12',
4409 'AS': '202.70.112.0/20',
4410 'AT': '77.116.0.0/14',
4411 'AU': '1.128.0.0/11',
4412 'AW': '181.41.0.0/18',
4413 'AX': '185.217.4.0/22',
4414 'AZ': '5.197.0.0/16',
4415 'BA': '31.176.128.0/17',
4416 'BB': '65.48.128.0/17',
4417 'BD': '114.130.0.0/16',
4418 'BE': '57.0.0.0/8',
4419 'BF': '102.178.0.0/15',
4420 'BG': '95.42.0.0/15',
4421 'BH': '37.131.0.0/17',
4422 'BI': '154.117.192.0/18',
4423 'BJ': '137.255.0.0/16',
4424 'BL': '185.212.72.0/23',
4425 'BM': '196.12.64.0/18',
4426 'BN': '156.31.0.0/16',
4427 'BO': '161.56.0.0/16',
4428 'BQ': '161.0.80.0/20',
4429 'BR': '191.128.0.0/12',
4430 'BS': '24.51.64.0/18',
4431 'BT': '119.2.96.0/19',
4432 'BW': '168.167.0.0/16',
4433 'BY': '178.120.0.0/13',
4434 'BZ': '179.42.192.0/18',
4435 'CA': '99.224.0.0/11',
4436 'CD': '41.243.0.0/16',
4437 'CF': '197.242.176.0/21',
4438 'CG': '160.113.0.0/16',
4439 'CH': '85.0.0.0/13',
4440 'CI': '102.136.0.0/14',
4441 'CK': '202.65.32.0/19',
4442 'CL': '152.172.0.0/14',
4443 'CM': '102.244.0.0/14',
4444 'CN': '36.128.0.0/10',
4445 'CO': '181.240.0.0/12',
4446 'CR': '201.192.0.0/12',
4447 'CU': '152.206.0.0/15',
4448 'CV': '165.90.96.0/19',
4449 'CW': '190.88.128.0/17',
4450 'CY': '31.153.0.0/16',
4451 'CZ': '88.100.0.0/14',
4452 'DE': '53.0.0.0/8',
4453 'DJ': '197.241.0.0/17',
4454 'DK': '87.48.0.0/12',
4455 'DM': '192.243.48.0/20',
4456 'DO': '152.166.0.0/15',
4457 'DZ': '41.96.0.0/12',
4458 'EC': '186.68.0.0/15',
4459 'EE': '90.190.0.0/15',
4460 'EG': '156.160.0.0/11',
4461 'ER': '196.200.96.0/20',
4462 'ES': '88.0.0.0/11',
4463 'ET': '196.188.0.0/14',
4464 'EU': '2.16.0.0/13',
4465 'FI': '91.152.0.0/13',
4466 'FJ': '144.120.0.0/16',
4467 'FK': '80.73.208.0/21',
4468 'FM': '119.252.112.0/20',
4469 'FO': '88.85.32.0/19',
4470 'FR': '90.0.0.0/9',
4471 'GA': '41.158.0.0/15',
4472 'GB': '25.0.0.0/8',
4473 'GD': '74.122.88.0/21',
4474 'GE': '31.146.0.0/16',
4475 'GF': '161.22.64.0/18',
4476 'GG': '62.68.160.0/19',
4477 'GH': '154.160.0.0/12',
4478 'GI': '95.164.0.0/16',
4479 'GL': '88.83.0.0/19',
4480 'GM': '160.182.0.0/15',
4481 'GN': '197.149.192.0/18',
4482 'GP': '104.250.0.0/19',
4483 'GQ': '105.235.224.0/20',
4484 'GR': '94.64.0.0/13',
4485 'GT': '168.234.0.0/16',
4486 'GU': '168.123.0.0/16',
4487 'GW': '197.214.80.0/20',
4488 'GY': '181.41.64.0/18',
4489 'HK': '113.252.0.0/14',
4490 'HN': '181.210.0.0/16',
4491 'HR': '93.136.0.0/13',
4492 'HT': '148.102.128.0/17',
4493 'HU': '84.0.0.0/14',
4494 'ID': '39.192.0.0/10',
4495 'IE': '87.32.0.0/12',
4496 'IL': '79.176.0.0/13',
4497 'IM': '5.62.80.0/20',
4498 'IN': '117.192.0.0/10',
4499 'IO': '203.83.48.0/21',
4500 'IQ': '37.236.0.0/14',
4501 'IR': '2.176.0.0/12',
4502 'IS': '82.221.0.0/16',
4503 'IT': '79.0.0.0/10',
4504 'JE': '87.244.64.0/18',
4505 'JM': '72.27.0.0/17',
4506 'JO': '176.29.0.0/16',
4507 'JP': '133.0.0.0/8',
4508 'KE': '105.48.0.0/12',
4509 'KG': '158.181.128.0/17',
4510 'KH': '36.37.128.0/17',
4511 'KI': '103.25.140.0/22',
4512 'KM': '197.255.224.0/20',
4513 'KN': '198.167.192.0/19',
4514 'KP': '175.45.176.0/22',
4515 'KR': '175.192.0.0/10',
4516 'KW': '37.36.0.0/14',
4517 'KY': '64.96.0.0/15',
4518 'KZ': '2.72.0.0/13',
4519 'LA': '115.84.64.0/18',
4520 'LB': '178.135.0.0/16',
4521 'LC': '24.92.144.0/20',
4522 'LI': '82.117.0.0/19',
4523 'LK': '112.134.0.0/15',
4524 'LR': '102.183.0.0/16',
4525 'LS': '129.232.0.0/17',
4526 'LT': '78.56.0.0/13',
4527 'LU': '188.42.0.0/16',
4528 'LV': '46.109.0.0/16',
4529 'LY': '41.252.0.0/14',
4530 'MA': '105.128.0.0/11',
4531 'MC': '88.209.64.0/18',
4532 'MD': '37.246.0.0/16',
4533 'ME': '178.175.0.0/17',
4534 'MF': '74.112.232.0/21',
4535 'MG': '154.126.0.0/17',
4536 'MH': '117.103.88.0/21',
4537 'MK': '77.28.0.0/15',
4538 'ML': '154.118.128.0/18',
4539 'MM': '37.111.0.0/17',
4540 'MN': '49.0.128.0/17',
4541 'MO': '60.246.0.0/16',
4542 'MP': '202.88.64.0/20',
4543 'MQ': '109.203.224.0/19',
4544 'MR': '41.188.64.0/18',
4545 'MS': '208.90.112.0/22',
4546 'MT': '46.11.0.0/16',
4547 'MU': '105.16.0.0/12',
4548 'MV': '27.114.128.0/18',
4549 'MW': '102.70.0.0/15',
4550 'MX': '187.192.0.0/11',
4551 'MY': '175.136.0.0/13',
4552 'MZ': '197.218.0.0/15',
4553 'NA': '41.182.0.0/16',
4554 'NC': '101.101.0.0/18',
4555 'NE': '197.214.0.0/18',
4556 'NF': '203.17.240.0/22',
4557 'NG': '105.112.0.0/12',
4558 'NI': '186.76.0.0/15',
4559 'NL': '145.96.0.0/11',
4560 'NO': '84.208.0.0/13',
4561 'NP': '36.252.0.0/15',
4562 'NR': '203.98.224.0/19',
4563 'NU': '49.156.48.0/22',
4564 'NZ': '49.224.0.0/14',
4565 'OM': '5.36.0.0/15',
4566 'PA': '186.72.0.0/15',
4567 'PE': '186.160.0.0/14',
4568 'PF': '123.50.64.0/18',
4569 'PG': '124.240.192.0/19',
4570 'PH': '49.144.0.0/13',
4571 'PK': '39.32.0.0/11',
4572 'PL': '83.0.0.0/11',
4573 'PM': '70.36.0.0/20',
4574 'PR': '66.50.0.0/16',
4575 'PS': '188.161.0.0/16',
4576 'PT': '85.240.0.0/13',
4577 'PW': '202.124.224.0/20',
4578 'PY': '181.120.0.0/14',
4579 'QA': '37.210.0.0/15',
4580 'RE': '102.35.0.0/16',
4581 'RO': '79.112.0.0/13',
4582 'RS': '93.86.0.0/15',
4583 'RU': '5.136.0.0/13',
4584 'RW': '41.186.0.0/16',
4585 'SA': '188.48.0.0/13',
4586 'SB': '202.1.160.0/19',
4587 'SC': '154.192.0.0/11',
4588 'SD': '102.120.0.0/13',
4589 'SE': '78.64.0.0/12',
4590 'SG': '8.128.0.0/10',
4591 'SI': '188.196.0.0/14',
4592 'SK': '78.98.0.0/15',
4593 'SL': '102.143.0.0/17',
4594 'SM': '89.186.32.0/19',
4595 'SN': '41.82.0.0/15',
4596 'SO': '154.115.192.0/18',
4597 'SR': '186.179.128.0/17',
4598 'SS': '105.235.208.0/21',
4599 'ST': '197.159.160.0/19',
4600 'SV': '168.243.0.0/16',
4601 'SX': '190.102.0.0/20',
4602 'SY': '5.0.0.0/16',
4603 'SZ': '41.84.224.0/19',
4604 'TC': '65.255.48.0/20',
4605 'TD': '154.68.128.0/19',
4606 'TG': '196.168.0.0/14',
4607 'TH': '171.96.0.0/13',
4608 'TJ': '85.9.128.0/18',
4609 'TK': '27.96.24.0/21',
4610 'TL': '180.189.160.0/20',
4611 'TM': '95.85.96.0/19',
4612 'TN': '197.0.0.0/11',
4613 'TO': '175.176.144.0/21',
4614 'TR': '78.160.0.0/11',
4615 'TT': '186.44.0.0/15',
4616 'TV': '202.2.96.0/19',
4617 'TW': '120.96.0.0/11',
4618 'TZ': '156.156.0.0/14',
4619 'UA': '37.52.0.0/14',
4620 'UG': '102.80.0.0/13',
4621 'US': '6.0.0.0/8',
4622 'UY': '167.56.0.0/13',
4623 'UZ': '84.54.64.0/18',
4624 'VA': '212.77.0.0/19',
4625 'VC': '207.191.240.0/21',
4626 'VE': '186.88.0.0/13',
4627 'VG': '66.81.192.0/20',
4628 'VI': '146.226.0.0/16',
4629 'VN': '14.160.0.0/11',
4630 'VU': '202.80.32.0/20',
4631 'WF': '117.20.32.0/21',
4632 'WS': '202.4.32.0/19',
4633 'YE': '134.35.0.0/16',
4634 'YT': '41.242.116.0/22',
4635 'ZA': '41.0.0.0/11',
4636 'ZM': '102.144.0.0/13',
4637 'ZW': '102.177.192.0/18',
4638 }
4639
4640 @classmethod
4641 def random_ipv4(cls, code_or_block):
4642 if len(code_or_block) == 2:
4643 block = cls._country_ip_map.get(code_or_block.upper())
4644 if not block:
4645 return None
4646 else:
4647 block = code_or_block
4648 addr, preflen = block.split('/')
4649 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4650 addr_max = addr_min | (0xffffffff >> int(preflen))
4651 return str(socket.inet_ntoa(
4652 struct.pack('!L', random.randint(addr_min, addr_max))))
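# e.g. GeoUtils.random_ipv4('DE') draws an address from 53.0.0.0/8, while
# GeoUtils.random_ipv4('192.168.0.0/16') samples the given block directly
# (editor's illustration).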
4653
4654
4655 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4656 def __init__(self, proxies=None):
4657 # Set default handlers
4658 for type in ('http', 'https'):
4659 setattr(self, '%s_open' % type,
4660 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4661 meth(r, proxy, type))
4662 urllib.request.ProxyHandler.__init__(self, proxies)
4663
4664 def proxy_open(self, req, proxy, type):
4665 req_proxy = req.headers.get('Ytdl-request-proxy')
4666 if req_proxy is not None:
4667 proxy = req_proxy
4668 del req.headers['Ytdl-request-proxy']
4669
4670 if proxy == '__noproxy__':
4671 return None # No Proxy
4672 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4673 req.add_header('Ytdl-socks-proxy', proxy)
4674 # yt-dlp's http/https handlers wrap the socket with SOCKS themselves
4675 return None
4676 return urllib.request.ProxyHandler.proxy_open(
4677 self, req, proxy, type)
4678
4679
4680 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4681 # released into Public Domain
4682 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4683
4684 def long_to_bytes(n, blocksize=0):
4685 """long_to_bytes(n:long, blocksize:int) : string
4686 Convert a long integer to a byte string.
4687
4688 If optional blocksize is given and greater than zero, pad the front of the
4689 byte string with binary zeros so that the length is a multiple of
4690 blocksize.
4691 """
4692 # after much testing, this algorithm was deemed to be the fastest
4693 s = b''
4694 n = int(n)
4695 while n > 0:
4696 s = struct.pack('>I', n & 0xffffffff) + s
4697 n = n >> 32
4698 # strip off leading zeros
4699 for i in range(len(s)):
4700 if s[i] != b'\000'[0]:
4701 break
4702 else:
4703 # only happens when n == 0
4704 s = b'\000'
4705 i = 0
4706 s = s[i:]
4707 # add back some pad bytes. this could be done more efficiently w.r.t. the
4708 # de-padding being done above, but sigh...
4709 if blocksize > 0 and len(s) % blocksize:
4710 s = (blocksize - len(s) % blocksize) * b'\000' + s
4711 return s
4712
4713
4714 def bytes_to_long(s):
4715 """bytes_to_long(string) : long
4716 Convert a byte string to a long integer.
4717
4718 This is (essentially) the inverse of long_to_bytes().
4719 """
4720 acc = 0
4721 length = len(s)
4722 if length % 4:
4723 extra = (4 - length % 4)
4724 s = b'\000' * extra + s
4725 length = length + extra
4726 for i in range(0, length, 4):
4727 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4728 return acc
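# Round-trip sketch (editor's examples):
#   bytes_to_long(b'\x01\x00')     -> 256
#   long_to_bytes(256)             -> b'\x01\x00'
#   long_to_bytes(1, blocksize=4)  -> b'\x00\x00\x00\x01'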
4729
4730
4731 def ohdave_rsa_encrypt(data, exponent, modulus):
4732 '''
4733 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4734
4735 Input:
4736 data: data to encrypt, bytes-like object
4737 exponent, modulus: parameter e and N of RSA algorithm, both integer
4738 Output: hex string of encrypted data
4739
4740 Limitation: supports one block encryption only
4741 '''
4742
4743 payload = int(binascii.hexlify(data[::-1]), 16)
4744 encrypted = pow(payload, exponent, modulus)
4745 return '%x' % encrypted
4746
4747
4748 def pkcs1pad(data, length):
4749 """
4750 Padding input data with PKCS#1 scheme
4751
4752 @param {int[]} data input data
4753 @param {int} length target length
4754 @returns {int[]} padded data
4755 """
4756 if len(data) > length - 11:
4757 raise ValueError('Input data too long for PKCS#1 padding')
4758
4759 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 v1.5 padding octets must be non-zero
4760 return [0, 2] + pseudo_random + [0] + data
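# e.g. pkcs1pad([1, 2, 3], 16) returns a 16-element block shaped
# [0, 2, <10 random non-zero octets>, 0, 1, 2, 3] (editor's illustration).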
4761
4762
4763 def _base_n_table(n, table):
4764 if not table and not n:
4765 raise ValueError('Either table or n must be specified')
4766 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4767
4768 if n and n != len(table):
4769 raise ValueError(f'base {n} exceeds table length {len(table)}')
4770 return table
4771
4772
4773 def encode_base_n(num, n=None, table=None):
4774 """Convert given int to a base-n string"""
4775 table = _base_n_table(n, table)
4776 if not num:
4777 return table[0]
4778
4779 result, base = '', len(table)
4780 while num:
4781 result = table[num % base] + result
4782 num = num // base
4783 return result
4784
4785
4786 def decode_base_n(string, n=None, table=None):
4787 """Convert given base-n string to int"""
4788 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4789 result, base = 0, len(table)
4790 for char in string:
4791 result = result * base + table[char]
4792 return result
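# e.g. encode_base_n(255, 16) -> 'ff' and decode_base_n('ff', 16) -> 255;
# a custom alphabet can be supplied via table= (editor's examples).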
4793
4794
4795 def decode_base(value, digits):
4796 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4797 'and may be removed in a future version. Use yt_dlp.utils.decode_base_n instead')
4798 return decode_base_n(value, table=digits)
4799
4800
4801 def decode_packed_codes(code):
4802 mobj = re.search(PACKED_CODES_RE, code)
4803 obfuscated_code, base, count, symbols = mobj.groups()
4804 base = int(base)
4805 count = int(count)
4806 symbols = symbols.split('|')
4807 symbol_table = {}
4808
4809 while count:
4810 count -= 1
4811 base_n_count = encode_base_n(count, base)
4812 symbol_table[base_n_count] = symbols[count] or base_n_count
4813
4814 return re.sub(
4815 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4816 obfuscated_code)
4817
4818
4819 def caesar(s, alphabet, shift):
4820 if shift == 0:
4821 return s
4822 l = len(alphabet)
4823 return ''.join(
4824 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4825 for c in s)
4826
4827
4828 def rot47(s):
4829 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
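# e.g. caesar('ab', 'abc', 1) -> 'bc' and rot47('yt-dlp') -> 'JE\\5=A'
# (editor's examples; rot47 shifts within the printable ASCII range 33-126).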
4830
4831
4832 def parse_m3u8_attributes(attrib):
4833 info = {}
4834 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4835 if val.startswith('"'):
4836 val = val[1:-1]
4837 info[key] = val
4838 return info
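# e.g. (editor's example; quoted values keep embedded commas):
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#       -> {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}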
4839
4840
4841 def urshift(val, n):
4842 return val >> n if val >= 0 else (val + 0x100000000) >> n
4843
4844
4845 # Based on png2str() written by @gdkchan and improved by @yokrysty
4846 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4847 def decode_png(png_data):
4848 # Reference: https://www.w3.org/TR/PNG/
4849 header = png_data[8:]
4850
4851 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4852 raise OSError('Not a valid PNG file.')
4853
4854 int_map = {1: '>B', 2: '>H', 4: '>I'}
4855 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4856
4857 chunks = []
4858
4859 while header:
4860 length = unpack_integer(header[:4])
4861 header = header[4:]
4862
4863 chunk_type = header[:4]
4864 header = header[4:]
4865
4866 chunk_data = header[:length]
4867 header = header[length:]
4868
4869 header = header[4:] # Skip CRC
4870
4871 chunks.append({
4872 'type': chunk_type,
4873 'length': length,
4874 'data': chunk_data
4875 })
4876
4877 ihdr = chunks[0]['data']
4878
4879 width = unpack_integer(ihdr[:4])
4880 height = unpack_integer(ihdr[4:8])
4881
4882 idat = b''
4883
4884 for chunk in chunks:
4885 if chunk['type'] == b'IDAT':
4886 idat += chunk['data']
4887
4888 if not idat:
4889 raise OSError('Unable to read PNG data.')
4890
4891 decompressed_data = bytearray(zlib.decompress(idat))
4892
4893 stride = width * 3
4894 pixels = []
4895
4896 def _get_pixel(idx):
4897 x = idx % stride
4898 y = idx // stride
4899 return pixels[y][x]
4900
4901 for y in range(height):
4902 basePos = y * (1 + stride)
4903 filter_type = decompressed_data[basePos]
4904
4905 current_row = []
4906
4907 pixels.append(current_row)
4908
4909 for x in range(stride):
4910 color = decompressed_data[1 + basePos + x]
4911 basex = y * stride + x
4912 left = 0
4913 up = 0
4914
4915 if x > 2:
4916 left = _get_pixel(basex - 3)
4917 if y > 0:
4918 up = _get_pixel(basex - stride)
4919
4920 if filter_type == 1: # Sub
4921 color = (color + left) & 0xff
4922 elif filter_type == 2: # Up
4923 color = (color + up) & 0xff
4924 elif filter_type == 3: # Average
4925 color = (color + ((left + up) >> 1)) & 0xff
4926 elif filter_type == 4: # Paeth
4927 a = left
4928 b = up
4929 c = 0
4930
4931 if x > 2 and y > 0:
4932 c = _get_pixel(basex - stride - 3)
4933
4934 p = a + b - c
4935
4936 pa = abs(p - a)
4937 pb = abs(p - b)
4938 pc = abs(p - c)
4939
4940 if pa <= pb and pa <= pc:
4941 color = (color + a) & 0xff
4942 elif pb <= pc:
4943 color = (color + b) & 0xff
4944 else:
4945 color = (color + c) & 0xff
4946
4947 current_row.append(color)
4948
4949 return width, height, pixels
4950
4951
4952 def write_xattr(path, key, value):
4953 # Windows: Write xattrs to NTFS Alternate Data Streams:
4954 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4955 if compat_os_name == 'nt':
4956 assert ':' not in key
4957 assert os.path.exists(path)
4958
4959 try:
4960 with open(f'{path}:{key}', 'wb') as f:
4961 f.write(value)
4962 except OSError as e:
4963 raise XAttrMetadataError(e.errno, e.strerror)
4964 return
4965
4966 # UNIX Method 1. Use the xattr/pyxattr modules
4967
4968 setxattr = None
4969 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4970 # Unicode arguments are not supported in pyxattr until version 0.5.0
4971 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4972 if version_tuple(xattr.__version__) >= (0, 5, 0):
4973 setxattr = xattr.set
4974 elif xattr:
4975 setxattr = xattr.setxattr
4976
4977 if setxattr:
4978 try:
4979 setxattr(path, key, value)
4980 except OSError as e:
4981 raise XAttrMetadataError(e.errno, e.strerror)
4982 return
4983
4984 # UNIX Method 2. Use setfattr/xattr executables
4985 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4986 else 'xattr' if check_executable('xattr', ['-h']) else None)
4987 if not exe:
4988 raise XAttrUnavailableError(
4989 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4990 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4991
4992 value = value.decode()
4993 try:
4994 _, stderr, returncode = Popen.run(
4995 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4996 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4997 except OSError as e:
4998 raise XAttrMetadataError(e.errno, e.strerror)
4999 if returncode:
5000 raise XAttrMetadataError(returncode, stderr)
5001
5002
5003 def random_birthday(year_field, month_field, day_field):
5004 start_date = datetime.date(1950, 1, 1)
5005 end_date = datetime.date(1995, 12, 31)
5006 offset = random.randint(0, (end_date - start_date).days)
5007 random_date = start_date + datetime.timedelta(offset)
5008 return {
5009 year_field: str(random_date.year),
5010 month_field: str(random_date.month),
5011 day_field: str(random_date.day),
5012 }
5013
5014
5015 # Templates for internet shortcut files, which are plain text files.
5016 DOT_URL_LINK_TEMPLATE = '''\
5017 [InternetShortcut]
5018 URL=%(url)s
5019 '''
5020
5021 DOT_WEBLOC_LINK_TEMPLATE = '''\
5022 <?xml version="1.0" encoding="UTF-8"?>
5023 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5024 <plist version="1.0">
5025 <dict>
5026 \t<key>URL</key>
5027 \t<string>%(url)s</string>
5028 </dict>
5029 </plist>
5030 '''
5031
5032 DOT_DESKTOP_LINK_TEMPLATE = '''\
5033 [Desktop Entry]
5034 Encoding=UTF-8
5035 Name=%(filename)s
5036 Type=Link
5037 URL=%(url)s
5038 Icon=text-html
5039 '''
5040
5041 LINK_TEMPLATES = {
5042 'url': DOT_URL_LINK_TEMPLATE,
5043 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5044 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5045 }
5046
5047
5048 def iri_to_uri(iri):
5049 """
5050 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5051
5052 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5053 """
5054
5055 iri_parts = urllib.parse.urlparse(iri)
5056
5057 if '[' in iri_parts.netloc:
5058 raise ValueError('IPv6 URIs are not yet supported.')
5059 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5060
5061 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5062
5063 net_location = ''
5064 if iri_parts.username:
5065 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5066 if iri_parts.password is not None:
5067 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5068 net_location += '@'
5069
5070 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5071 # The 'idna' encoding produces ASCII text.
5072 if iri_parts.port is not None and iri_parts.port != 80:
5073 net_location += ':' + str(iri_parts.port)
5074
5075 return urllib.parse.urlunparse(
5076 (iri_parts.scheme,
5077 net_location,
5078
5079 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5080
5081 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5082 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5083
5084 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5085 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5086
5087 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5088
5089 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
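# e.g. iri_to_uri('http://example.com/路径?q=值')
#   -> 'http://example.com/%E8%B7%AF%E5%BE%84?q=%E5%80%BC' (editor's sketch)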
5090
5091
5092 def to_high_limit_path(path):
5093 if sys.platform in ['win32', 'cygwin']:
5094 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5095 return '\\\\?\\' + os.path.abspath(path)
5096
5097 return path
5098
5099
5100 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5101 val = traverse_obj(obj, *variadic(field))
5102 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5103 return default
5104 return template % func(val)
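# e.g. format_field({'width': 1280}, 'width', '%dpx') -> '1280px' and
# format_field({}, 'width', '%dpx', default='unknown') -> 'unknown'
# (editor's sketch).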
5105
5106
5107 def clean_podcast_url(url):
5108 return re.sub(r'''(?x)
5109 (?:
5110 (?:
5111 chtbl\.com/track|
5112 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5113 play\.podtrac\.com
5114 )/[^/]+|
5115 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5116 flex\.acast\.com|
5117 pd(?:
5118 cn\.co| # https://podcorn.com/analytics-prefix/
5119 st\.fm # https://podsights.com/docs/
5120 )/e
5121 )/''', '', url)
5122
5123
5124 _HEX_TABLE = '0123456789abcdef'
5125
5126
5127 def random_uuidv4():
5128 return re.sub(r'[xy]', lambda m: _HEX_TABLE[random.randint(0, 15) if m.group(0) == 'x' else random.randint(8, 11)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')  # 'y' carries the RFC 4122 variant bits (8, 9, a or b)
5129
5130
5131 def make_dir(path, to_screen=None):
5132 try:
5133 dn = os.path.dirname(path)
5134 if dn and not os.path.exists(dn):
5135 os.makedirs(dn)
5136 return True
5137 except OSError as err:
5138 if callable(to_screen):
5139 to_screen('unable to create directory ' + error_to_compat_str(err))
5140 return False
5141
5142
5143 def get_executable_path():
5144 from .update import _get_variant_and_executable_path
5145
5146 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5147
5148
5149 def load_plugins(name, suffix, namespace):
5150 classes = {}
5151 with contextlib.suppress(FileNotFoundError):
5152 plugins_spec = importlib.util.spec_from_file_location(
5153 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5154 plugins = importlib.util.module_from_spec(plugins_spec)
5155 sys.modules[plugins_spec.name] = plugins
5156 plugins_spec.loader.exec_module(plugins)
5157 for name in dir(plugins):
5158 if name in namespace:
5159 continue
5160 if not name.endswith(suffix):
5161 continue
5162 klass = getattr(plugins, name)
5163 classes[name] = namespace[name] = klass
5164 return classes
5165
5166
5167 def traverse_obj(
5168 obj, *path_list, default=None, expected_type=None, get_all=True,
5169 casesense=True, is_user_input=False, traverse_string=False):
5170 ''' Traverse nested list/dict/tuple
5171 @param path_list A list of paths which are checked one by one.
5172 Each path is a list of keys where each key is a:
5173 - None: Do nothing
5174 - string: A dictionary key
5175 - int: An index into a list
5176 - tuple: A list of keys all of which will be traversed
5177 - Ellipsis: Fetch all values in the object
5178 - Function: Takes the key and value as arguments
5179 and returns whether the key matches or not
5180 @param default Default value to return
5181 @param expected_type Only accept final value of this type (Can also be any callable)
5182 @param get_all Return all the values obtained from a path or only the first one
5183 @param casesense Whether to consider dictionary keys as case sensitive
5184 @param is_user_input Whether the keys are generated from user input. If True,
5185 strings are converted to int/slice if necessary
5186 @param traverse_string Whether to traverse inside strings. If True, any
5187 non-compatible object will also be converted into a string
5188 # TODO: Write tests
5189 '''
5190 if not casesense:
5191 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5192 path_list = (map(_lower, variadic(path)) for path in path_list)
5193
5194 def _traverse_obj(obj, path, _current_depth=0):
5195 nonlocal depth
5196 path = tuple(variadic(path))
5197 for i, key in enumerate(path):
5198 if None in (key, obj):
5199 return obj
5200 if isinstance(key, (list, tuple)):
5201 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5202 key = ...
5203 if key is ...:
5204 obj = (obj.values() if isinstance(obj, dict)
5205 else obj if isinstance(obj, (list, tuple, LazyList))
5206 else str(obj) if traverse_string else [])
5207 _current_depth += 1
5208 depth = max(depth, _current_depth)
5209 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5210 elif callable(key):
5211 if isinstance(obj, (list, tuple, LazyList)):
5212 obj = enumerate(obj)
5213 elif isinstance(obj, dict):
5214 obj = obj.items()
5215 else:
5216 if not traverse_string:
5217 return None
5218 obj = str(obj)
5219 _current_depth += 1
5220 depth = max(depth, _current_depth)
5221 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5222 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5223 obj = (obj.get(key) if casesense or (key in obj)
5224 else next((v for k, v in obj.items() if _lower(k) == key), None))
5225 else:
5226 if is_user_input:
5227 key = (int_or_none(key) if ':' not in key
5228 else slice(*map(int_or_none, key.split(':'))))
5229 if key == slice(None):
5230 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5231 if not isinstance(key, (int, slice)):
5232 return None
5233 if not isinstance(obj, (list, tuple, LazyList)):
5234 if not traverse_string:
5235 return None
5236 obj = str(obj)
5237 try:
5238 obj = obj[key]
5239 except IndexError:
5240 return None
5241 return obj
5242
5243 if isinstance(expected_type, type):
5244 type_test = lambda val: val if isinstance(val, expected_type) else None
5245 else:
5246 type_test = expected_type or IDENTITY
5247
5248 for path in path_list:
5249 depth = 0
5250 val = _traverse_obj(obj, path)
5251 if val is not None:
5252 if depth:
5253 for _ in range(depth - 1):
5254 val = itertools.chain.from_iterable(v for v in val if v is not None)
5255 val = [v for v in map(type_test, val) if v is not None]
5256 if val:
5257 return val if get_all else val[0]
5258 else:
5259 val = type_test(val)
5260 if val is not None:
5261 return val
5262 return default
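# A few illustrative calls (editor's sketch; see the docstring above):
#   traverse_obj({'a': {'b': [1, 2]}}, ('a', 'b', 1))           -> 2
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))  -> [1, 2]
#   traverse_obj({}, ('a', 'b'), default='missing')             -> 'missing'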
5263
5264
5265 def traverse_dict(dictn, keys, casesense=True):
5266 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5267 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5268 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5269
5270
5271 def get_first(obj, keys, **kwargs):
5272 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5273
5274
5275 def variadic(x, allowed_types=(str, bytes, dict)):
5276 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5277
5278
5279 def time_seconds(**kwargs):
5280 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5281 return t.timestamp()
5282
5283
5284 # create a JSON Web Signature (JWS) with the HS256 algorithm
5285 # the resulting format is in JWS Compact Serialization
5286 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5287 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5288 def jwt_encode_hs256(payload_data, key, headers={}):
5289 header_data = {
5290 'alg': 'HS256',
5291 'typ': 'JWT',
5292 }
5293 if headers:
5294 header_data.update(headers)
5295 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5296 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5297 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5298 signature_b64 = base64.b64encode(h.digest())
5299 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5300 return token
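# e.g. jwt_encode_hs256({'sub': 'user'}, 'secret') returns bytes of the form
# b'<header>.<payload>.<signature>'. Note that standard base64 is used here,
# whereas RFC 7515 specifies unpadded base64url (editor's note).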
5301
5302
5303 # can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5304 def jwt_decode_hs256(jwt):
5305 header_b64, payload_b64, signature_b64 = jwt.split('.')
5306 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))  # re-add the padding that JWT's base64url encoding strips
5307 return payload_data
5308
5309
5310 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5311
5312
5313 @functools.cache
5314 def supports_terminal_sequences(stream):
5315 if compat_os_name == 'nt':
5316 if not WINDOWS_VT_MODE:
5317 return False
5318 elif not os.getenv('TERM'):
5319 return False
5320 try:
5321 return stream.isatty()
5322 except BaseException:
5323 return False
5324
5325
5326 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5327 if get_windows_version() < (10, 0, 10586):
5328 return
5329 global WINDOWS_VT_MODE
5330 try:
5331 Popen.run('', shell=True)
5332 except Exception:
5333 return
5334
5335 WINDOWS_VT_MODE = True
5336 supports_terminal_sequences.cache_clear()
5337
5338
5339 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5340
5341
5342 def remove_terminal_sequences(string):
5343 return _terminal_sequences_re.sub('', string)
5344
5345
5346 def number_of_digits(number):
5347 return len('%d' % number)
5348
5349
5350 def join_nonempty(*values, delim='-', from_dict=None):
5351 if from_dict is not None:
5352 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5353 return delim.join(map(str, filter(None, values)))
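# e.g. join_nonempty('mp4', None, 1080, delim='-') -> 'mp4-1080'; falsy values
# are dropped before joining (editor's example).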
5354
5355
5356 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5357 """
5358 Find the largest format dimensions in terms of video width and, for each thumbnail:
5359 * Modify the URL: Match the width with the provided regex and replace with the former width
5360 * Update dimensions
5361
5362 This function is useful with video services that scale the provided thumbnails on demand
5363 """
5364 _keys = ('width', 'height')
5365 max_dimensions = max(
5366 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5367 default=(0, 0))
5368 if not max_dimensions[0]:
5369 return thumbnails
5370 return [
5371 merge_dicts(
5372 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5373 dict(zip(_keys, max_dimensions)), thumbnail)
5374 for thumbnail in thumbnails
5375 ]
5376
5377
5378 def parse_http_range(range):
5379 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5380 if not range:
5381 return None, None, None
5382 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5383 if not crg:
5384 return None, None, None
5385 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
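# Editor's examples:
#   parse_http_range('bytes 500-999/1234') -> (500, 999, 1234)
#   parse_http_range('bytes=500-')         -> (500, None, None)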
5386
5387
5388 def read_stdin(what):
5389 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5390 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5391 return sys.stdin
5392
5393
5394 class Config:
5395 own_args = None
5396 parsed_args = None
5397 filename = None
5398 __initialized = False
5399
5400 def __init__(self, parser, label=None):
5401 self.parser, self.label = parser, label
5402 self._loaded_paths, self.configs = set(), []
5403
5404 def init(self, args=None, filename=None):
5405 assert not self.__initialized
5406 self.own_args, self.filename = args, filename
5407 return self.load_configs()
5408
5409 def load_configs(self):
5410 directory = ''
5411 if self.filename:
5412 location = os.path.realpath(self.filename)
5413 directory = os.path.dirname(location)
5414 if location in self._loaded_paths:
5415 return False
5416 self._loaded_paths.add(location)
5417
5418 self.__initialized = True
5419 opts, _ = self.parser.parse_known_args(self.own_args)
5420 self.parsed_args = self.own_args
5421 for location in opts.config_locations or []:
5422 if location == '-':
5423 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5424 continue
5425 location = os.path.join(directory, expand_path(location))
5426 if os.path.isdir(location):
5427 location = os.path.join(location, 'yt-dlp.conf')
5428 if not os.path.exists(location):
5429 self.parser.error(f'config location {location} does not exist')
5430 self.append_config(self.read_file(location), location)
5431 return True
5432
5433 def __str__(self):
5434 label = join_nonempty(
5435 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5436 delim=' ')
5437 return join_nonempty(
5438 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5439 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5440 delim='\n')
5441
5442 @staticmethod
5443 def read_file(filename, default=[]):
5444 try:
5445 optionf = open(filename)
5446 except OSError:
5447 return default # silently skip if file is not present
5448 try:
5449 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5450 contents = optionf.read()
5451 res = shlex.split(contents, comments=True)
5452 except Exception as err:
5453 raise ValueError(f'Unable to parse "{filename}": {err}')
5454 finally:
5455 optionf.close()
5456 return res
5457
    @staticmethod
    def hide_login_info(opts):
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

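    # For example (illustrative): hide_login_info(['-u', 'name', '--password=secret'])
    # would return ['-u', 'PRIVATE', '--password=PRIVATE'].
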
    def append_config(self, *args, label=None):
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)


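# A hedged usage sketch for Config (parser construction elided; names are
# illustrative). The parser is expected to expose a `config_locations` option,
# as load_configs() above assumes:
#
#   config = Config(parser, label='Main')
#   config.init(sys.argv[1:])          # loads any --config-locations files recursively
#   opts, args = config.parse_known_args()

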
class WebSocketsWrapper:
    """Wraps the websockets module for use in non-async scopes"""
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: the "loop" argument is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Cancel pending tasks *before* closing the loop;
            # running them on an already-closed loop would raise RuntimeError
            self._cancel_all_tasks(self.loop)
            self.loop.close()

    # Taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # For contributors: if any other library that uses asyncio needs to be run in a non-async scope,
    # move these functions out of this class
    @staticmethod
    def run_with_loop(main, loop):
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: the "loop" argument to asyncio.gather() is removed in Python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })


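# A hedged usage sketch for WebSocketsWrapper (the endpoint and payload are
# hypothetical):
#
#   ws = WebSocketsWrapper('wss://example.com/socket', headers={'Origin': 'https://example.com'})
#   ws.send('{"op": "subscribe"}')
#   message = ws.recv()
#   ws.__exit__(None, None, None)  # or rely on the atexit hook registered in __init__

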
def merge_headers(*dicts):
    """Merge dicts of HTTP headers case-insensitively, with later dicts taking priority"""
    return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}


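# For example (illustrative):
#   merge_headers({'accept': '*/*'}, {'Accept': 'text/html'}) == {'Accept': 'text/html'}

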
def cached_method(f):
    """Cache the return value of a method, keyed by its bound arguments"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound_args = signature.bind(self, *args, **kwargs)
        bound_args.apply_defaults()
        key = tuple(bound_args.arguments.values())

        if not hasattr(self, '__cached_method__cache'):
            self.__cached_method__cache = {}
        cache = self.__cached_method__cache.setdefault(f.__name__, {})
        if key not in cache:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper


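# A hedged sketch of the decorator above (the class, method and
# `expensive_lookup` names are illustrative, not part of this module):
#
#   class Extractor:
#       @cached_method
#       def fetch(self, url):
#           return expensive_lookup(url)   # runs only once per (self, url)
#
#   e = Extractor()
#   e.fetch('a'); e.fetch('a')   # the second call is served from the cache

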
class classproperty:
    """Property-style access for class methods: the getter receives the class, not an instance"""

    def __init__(self, func):
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, _, cls):
        return self.func(cls)


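# For example (illustrative):
#   class Foo:
#       @classproperty
#       def name(cls):
#           return cls.__name__
#
#   Foo.name == 'Foo'     # no instance required
#   Foo().name == 'Foo'   # instance access also resolves through the class

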
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        return iter(self.__dict__.values())

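    # The trailing underscore in `items_` presumably avoids clashing with a
    # namespace member named `items` (an assumption; the source does not say)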
    @property
    def items_(self):
        return self.__dict__.items()


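# For example (illustrative):
#   n = Namespace(video='mp4', audio='m4a')
#   list(n) == ['mp4', 'm4a']                          # __iter__ yields the values
#   dict(n.items_) == {'video': 'mp4', 'audio': 'm4a'}

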
# Deprecated
has_certifi = bool(certifi)
has_websockets = bool(websockets)