#!/usr/bin/env python3
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.parse
import xml.etree.ElementTree
import zlib

from .compat import asyncio, functools  # isort: split
from .compat import (
    compat_chr,
    compat_cookiejar,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_http_client,
    compat_HTTPError,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
)
from .dependencies import brotli, certifi, websockets
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

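# Illustrative sketch of how xpath_with_ns expands prefixed paths (the
# prefix-to-URI mapping here is a made-up example, not part of the module):
#   >>> xpath_with_ns('ns:media/ns:title', {'ns': 'http://example.com/ns'})
#   '{http://example.com/ns}media/{http://example.com/ns}title'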

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kwargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
413 """Return the content of the tag with the specified attribute in the passed HTML document"""
414 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
415
416
417 def get_elements_html_by_attribute(*args, **kwargs):
418 """Return the html of the tag with the specified attribute in the passed HTML document"""
419 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
420
421
422 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
423 """
424 Return the text (content) and the html (whole) of the tag with the specified
425 attribute in the passed HTML document
426 """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
        (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
        \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')

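# A minimal usage sketch (made-up input); nested same-name tags are tracked by
# the tag stack above, so only the truly matching closing tag ends the scan:
#   >>> get_element_text_and_html_by_tag('span', '<div><span>hi</span></div>')
#   ('hi', '<span>hi</span>')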

class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs

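# A small usage sketch with a made-up element; valueless attributes come back
# as None and entities are decoded by the underlying HTMLParser:
#   >>> extract_attributes('<a href="foo.html" data-id=\'1\' disabled>')
#   {'href': 'foo.html', 'data-id': '1', 'disabled': None}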

def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

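# Illustrative example (made-up snippet): <br> and </p><p> boundaries become
# newlines, remaining tags are stripped and entities decoded:
#   >>> clean_html('<p>foo</p><p>bar &amp; baz</p>')
#   'foo\nbar & baz'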

class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)

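# Usage sketch: pass this class as `cls` to json.loads, which forwards the
# extra keyword arguments to the decoder; with ignore_extra, trailing garbage
# after the first JSON value is discarded (the input here is made up):
#   >>> json.loads('{"a": 1} trailing garbage', cls=LenientJSONDecoder, ignore_extra=True)
#   {'a': 1}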

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

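# Example (assuming the usual RFC 2822 shape that parsedate_tz accepts):
#   >>> timeconvert('Wed, 14 Jul 2021 12:00:00 +0000')
#   1626264000
# None is returned for strings parsedate_tz cannot handle.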

def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

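# A couple of illustrative calls (made-up titles; outputs follow the
# replacement rules above):
#   >>> sanitize_filename('Foo: Bar', restricted=False)
#   'Foo - Bar'
#   >>> sanitize_filename('Foo: Bar?', restricted=True)
#   'Foo_-_Bar'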

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

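# Examples of the fixups above (inputs are made up):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com')
#   'https://example.com'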

def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'

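# Usage sketch (made-up credentials): the userinfo part is stripped from the
# URL and returned as a ready-to-use Authorization header value:
#   >>> extract_basic_auth('http://user:pass@example.com/feed')
#   ('http://example.com/feed', 'Basic dXNlcjpwYXNz')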

def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())

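# Order-preserving deduplication; with lazy=True a generator is returned
# instead of a list (toy input for illustration):
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]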

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

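# Named, decimal and hexadecimal references are all handled (toy input):
#   >>> unescapeHTML('&amp; &#38; &#x26; &eacute;')
#   '& & & é'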

def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode

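# Usage sketch for the convenience classmethod (the pipes here are an
# assumption of this example, not a default of Popen.run itself):
#   stdout, stderr, returncode = Popen.run(
#       [sys.executable, '-c', 'print("hi")'],
#       text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#   # stdout == 'hi\n', returncode == 0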

def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

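# Sketch of the two helpers on a toy value: 3661.5 seconds is 1h 1m 1.5s:
#   >>> timetuple_from_msec(3661500)
#   Time(hours=1, minutes=1, seconds=1, milliseconds=500)
#   >>> formatSeconds(3661.5, msec=True)
#   '1:01:01.500'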

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
        # always respected by websites: some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise compat_cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
1658 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1659
1660 # A 303 must either use GET or HEAD for subsequent request
1661 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1662 if code == 303 and m != 'HEAD':
1663 m = 'GET'
1664 # 301 and 302 redirects are commonly turned into a GET from a POST
1665 # for subsequent requests by browsers, so we'll do the same.
1666 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1667 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1668 if code in (301, 302) and m == 'POST':
1669 m = 'GET'
1670
1671 return compat_urllib_request.Request(
1672 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1673 unverifiable=True, method=m)
1674
1675
1676 def extract_timezone(date_str):
1677 m = re.search(
1678 r'''(?x)
1679 ^.{8,}? # >=8 char non-TZ prefix, if present
1680 (?P<tz>Z| # just the UTC Z, or
1681 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1682 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1683 [ ]? # optional space
1684 (?P<sign>\+|-) # +/-
1685 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1686 $)
1687 ''', date_str)
1688 if not m:
1689 timezone = datetime.timedelta()
1690 else:
1691 date_str = date_str[:-len(m.group('tz'))]
1692 if not m.group('sign'):
1693 timezone = datetime.timedelta()
1694 else:
1695 sign = 1 if m.group('sign') == '+' else -1
1696 timezone = datetime.timedelta(
1697 hours=sign * int(m.group('hours')),
1698 minutes=sign * int(m.group('minutes')))
1699 return timezone, date_str
1700
1701
1702 def parse_iso8601(date_str, delimiter='T', timezone=None):
1703 """ Return a UNIX timestamp from the given date """
1704
1705 if date_str is None:
1706 return None
1707
1708 date_str = re.sub(r'\.[0-9]+', '', date_str)
1709
1710 if timezone is None:
1711 timezone, date_str = extract_timezone(date_str)
1712
1713 with contextlib.suppress(ValueError):
1714 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1715 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1716 return calendar.timegm(dt.timetuple())
1717
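# Illustrative examples (not part of the original source); note that
# fractional seconds are stripped before parsing:
#   parse_iso8601('2020-01-01T00:00:00Z') == 1577836800
#   parse_iso8601('2020-01-01T09:00:00.123+09:00') == 1577836800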
1718
1719 def date_formats(day_first=True):
1720 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1721
1722
1723 def unified_strdate(date_str, day_first=True):
1724 """Return a string with the date in the format YYYYMMDD"""
1725
1726 if date_str is None:
1727 return None
1728 upload_date = None
1729 # Replace commas
1730 date_str = date_str.replace(',', ' ')
1731 # Remove AM/PM + timezone
1732 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1733 _, date_str = extract_timezone(date_str)
1734
1735 for expression in date_formats(day_first):
1736 with contextlib.suppress(ValueError):
1737 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1738 if upload_date is None:
1739 timetuple = email.utils.parsedate_tz(date_str)
1740 if timetuple:
1741 with contextlib.suppress(ValueError):
1742 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1743 if upload_date is not None:
1744 return compat_str(upload_date)
1745
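# Illustrative examples (these rely on the DATE_FORMATS* tables defined
# earlier in this module):
#   unified_strdate('Dec 14, 2012') == '20121214'
#   unified_strdate('1968-12-10') == '19681210'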
1746
1747 def unified_timestamp(date_str, day_first=True):
1748 if date_str is None:
1749 return None
1750
1751 date_str = re.sub(r'[,|]', '', date_str)
1752
1753 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1754 timezone, date_str = extract_timezone(date_str)
1755
1756 # Remove AM/PM + timezone
1757 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1758
1759 # Remove unrecognized timezones from ISO 8601 alike timestamps
1760 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1761 if m:
1762 date_str = date_str[:-len(m.group('tz'))]
1763
1764 # Python only supports microseconds, so remove nanoseconds
1765 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1766 if m:
1767 date_str = m.group(1)
1768
1769 for expression in date_formats(day_first):
1770 with contextlib.suppress(ValueError):
1771 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1772 return calendar.timegm(dt.timetuple())
1773 timetuple = email.utils.parsedate_tz(date_str)
1774 if timetuple:
1775 return calendar.timegm(timetuple) + pm_delta * 3600
1776
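# Illustrative examples (again driven by the DATE_FORMATS* tables):
#   unified_timestamp('December 15, 2017 at 7:49 am') == 1513324140
#   unified_timestamp('2018-03-14T08:32:43.1493874+00:00') == 1521016363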
1777
1778 def determine_ext(url, default_ext='unknown_video'):
1779 if url is None or '.' not in url:
1780 return default_ext
1781 guess = url.partition('?')[0].rpartition('.')[2]
1782 if re.match(r'^[A-Za-z0-9]+$', guess):
1783 return guess
1784 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1785 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1786 return guess.rstrip('/')
1787 else:
1788 return default_ext
1789
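# Illustrative examples ('mp4' is assumed to be in KNOWN_EXTENSIONS,
# which is defined elsewhere in this module):
#   determine_ext('http://example.com/video.mp4?download=1') == 'mp4'
#   determine_ext('http://example.com/foo/bar.mp4/?download') == 'mp4'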
1790
1791 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1792 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1793
1794
1795 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1796 R"""
1797 Return a datetime object from a string.
1798 Supported format:
1799 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1800
1801 @param format strftime format of DATE
1802 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1803 auto: round to the unit provided in date_str (if applicable).
1804 """
1805 auto_precision = False
1806 if precision == 'auto':
1807 auto_precision = True
1808 precision = 'microsecond'
1809 today = datetime_round(datetime.datetime.utcnow(), precision)
1810 if date_str in ('now', 'today'):
1811 return today
1812 if date_str == 'yesterday':
1813 return today - datetime.timedelta(days=1)
1814 match = re.match(
1815 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1816 date_str)
1817 if match is not None:
1818 start_time = datetime_from_str(match.group('start'), precision, format)
1819 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1820 unit = match.group('unit')
1821 if unit == 'month' or unit == 'year':
1822 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1823 unit = 'day'
1824 else:
1825 if unit == 'week':
1826 unit = 'day'
1827 time *= 7
1828 delta = datetime.timedelta(**{unit + 's': time})
1829 new_date = start_time + delta
1830 if auto_precision:
1831 return datetime_round(new_date, unit)
1832 return new_date
1833
1834 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1835
1836
1837 def date_from_str(date_str, format='%Y%m%d', strict=False):
1838 R"""
1839 Return a date object from a string using datetime_from_str
1840
1841 @param strict Restrict allowed patterns to "YYYYMMDD" and
1842 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1843 """
1844 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1845 raise ValueError(f'Invalid date format "{date_str}"')
1846 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1847
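# Illustrative examples (relative dates are computed from UTC "now"):
#   date_from_str('20220315') == datetime.date(2022, 3, 15)
#   date_from_str('now-1week') -> the calendar date 7 days ago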
1848
1849 def datetime_add_months(dt, months):
1850 """Increment/Decrement a datetime object by months."""
1851 month = dt.month + months - 1
1852 year = dt.year + month // 12
1853 month = month % 12 + 1
1854 day = min(dt.day, calendar.monthrange(year, month)[1])
1855 return dt.replace(year, month, day)
1856
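# Illustrative example - the day is clamped to the length of the target month:
#   datetime_add_months(datetime.datetime(2022, 1, 31), 1)
#   == datetime.datetime(2022, 2, 28)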
1857
1858 def datetime_round(dt, precision='day'):
1859 """
1860 Round a datetime object's time to a specific precision
1861 """
1862 if precision == 'microsecond':
1863 return dt
1864
1865 unit_seconds = {
1866 'day': 86400,
1867 'hour': 3600,
1868 'minute': 60,
1869 'second': 1,
1870 }
1871 roundto = lambda x, n: ((x + n / 2) // n) * n
1872 timestamp = calendar.timegm(dt.timetuple())
1873 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1874
1875
1876 def hyphenate_date(date_str):
1877 """
1878 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1879 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1880 if match is not None:
1881 return '-'.join(match.groups())
1882 else:
1883 return date_str
1884
1885
1886 class DateRange:
1887 """Represents a time interval between two dates"""
1888
1889 def __init__(self, start=None, end=None):
1890 """start and end must be strings in the format accepted by date"""
1891 if start is not None:
1892 self.start = date_from_str(start, strict=True)
1893 else:
1894 self.start = datetime.datetime.min.date()
1895 if end is not None:
1896 self.end = date_from_str(end, strict=True)
1897 else:
1898 self.end = datetime.datetime.max.date()
1899 if self.start > self.end:
1900 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
1901
1902 @classmethod
1903 def day(cls, day):
1904 """Returns a range that only contains the given day"""
1905 return cls(day, day)
1906
1907 def __contains__(self, date):
1908 """Check if the date is in the range"""
1909 if not isinstance(date, datetime.date):
1910 date = date_from_str(date)
1911 return self.start <= date <= self.end
1912
1913 def __str__(self):
1914 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1915
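# Illustrative usage:
#   '20220315' in DateRange('20220301', '20220331')  # True
#   '20220401' in DateRange.day('20220315')  # False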
1916
1917 def platform_name():
1918 """ Returns the platform name as a compat_str """
1919 res = platform.platform()
1920 if isinstance(res, bytes):
1921 res = res.decode(preferredencoding())
1922
1923 assert isinstance(res, compat_str)
1924 return res
1925
1926
1927 @functools.cache
1928 def get_windows_version():
1929 ''' Get the Windows version. Returns () if not running on Windows '''
1930 if compat_os_name == 'nt':
1931 return version_tuple(platform.win32_ver()[1])
1932 else:
1933 return ()
1934
1935
1936 def write_string(s, out=None, encoding=None):
1937 assert isinstance(s, str)
1938 out = out or sys.stderr
1939
1940 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1941 s = re.sub(r'([\r\n]+)', r' \1', s)
1942
1943 enc, buffer = None, out
1944 if 'b' in getattr(out, 'mode', ''):
1945 enc = encoding or preferredencoding()
1946 elif hasattr(out, 'buffer'):
1947 buffer = out.buffer
1948 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1949
1950 buffer.write(s.encode(enc, 'ignore') if enc else s)
1951 out.flush()
1952
1953
1954 def bytes_to_intlist(bs):
1955 if not bs:
1956 return []
1957 if isinstance(bs[0], int): # Python 3
1958 return list(bs)
1959 else:
1960 return [ord(c) for c in bs]
1961
1962
1963 def intlist_to_bytes(xs):
1964 if not xs:
1965 return b''
1966 return compat_struct_pack('%dB' % len(xs), *xs)
1967
1968
1969 class LockingUnsupportedError(OSError):
1970 msg = 'File locking is not supported'
1971
1972 def __init__(self):
1973 super().__init__(self.msg)
1974
1975
1976 # Cross-platform file locking
1977 if sys.platform == 'win32':
1978 import ctypes.wintypes
1979 import msvcrt
1980
1981 class OVERLAPPED(ctypes.Structure):
1982 _fields_ = [
1983 ('Internal', ctypes.wintypes.LPVOID),
1984 ('InternalHigh', ctypes.wintypes.LPVOID),
1985 ('Offset', ctypes.wintypes.DWORD),
1986 ('OffsetHigh', ctypes.wintypes.DWORD),
1987 ('hEvent', ctypes.wintypes.HANDLE),
1988 ]
1989
1990 kernel32 = ctypes.windll.kernel32
1991 LockFileEx = kernel32.LockFileEx
1992 LockFileEx.argtypes = [
1993 ctypes.wintypes.HANDLE, # hFile
1994 ctypes.wintypes.DWORD, # dwFlags
1995 ctypes.wintypes.DWORD, # dwReserved
1996 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1997 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1998 ctypes.POINTER(OVERLAPPED) # Overlapped
1999 ]
2000 LockFileEx.restype = ctypes.wintypes.BOOL
2001 UnlockFileEx = kernel32.UnlockFileEx
2002 UnlockFileEx.argtypes = [
2003 ctypes.wintypes.HANDLE, # hFile
2004 ctypes.wintypes.DWORD, # dwReserved
2005 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2006 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2007 ctypes.POINTER(OVERLAPPED) # Overlapped
2008 ]
2009 UnlockFileEx.restype = ctypes.wintypes.BOOL
2010 whole_low = 0xffffffff
2011 whole_high = 0x7fffffff
2012
2013 def _lock_file(f, exclusive, block):
2014 overlapped = OVERLAPPED()
2015 overlapped.Offset = 0
2016 overlapped.OffsetHigh = 0
2017 overlapped.hEvent = 0
2018 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2019
2020 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2021 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2022 0, whole_low, whole_high, f._lock_file_overlapped_p):
2023 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2024 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2025
2026 def _unlock_file(f):
2027 assert f._lock_file_overlapped_p
2028 handle = msvcrt.get_osfhandle(f.fileno())
2029 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2030 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2031
2032 else:
2033 try:
2034 import fcntl
2035
2036 def _lock_file(f, exclusive, block):
2037 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2038 if not block:
2039 flags |= fcntl.LOCK_NB
2040 try:
2041 fcntl.flock(f, flags)
2042 except BlockingIOError:
2043 raise
2044 except OSError: # AOSP does not have flock()
2045 fcntl.lockf(f, flags)
2046
2047 def _unlock_file(f):
2048 try:
2049 fcntl.flock(f, fcntl.LOCK_UN)
2050 except OSError:
2051 fcntl.lockf(f, fcntl.LOCK_UN)
2052
2053 except ImportError:
2054
2055 def _lock_file(f, exclusive, block):
2056 raise LockingUnsupportedError()
2057
2058 def _unlock_file(f):
2059 raise LockingUnsupportedError()
2060
2061
2062 class locked_file:
2063 locked = False
2064
2065 def __init__(self, filename, mode, block=True, encoding=None):
2066 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2067 raise NotImplementedError(mode)
2068 self.mode, self.block = mode, block
2069
2070 writable = any(f in mode for f in 'wax+')
2071 readable = any(f in mode for f in 'r+')
2072 flags = functools.reduce(operator.ior, (
2073 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2074 getattr(os, 'O_BINARY', 0), # Windows only
2075 getattr(os, 'O_NOINHERIT', 0), # Windows only
2076 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2077 os.O_APPEND if 'a' in mode else 0,
2078 os.O_EXCL if 'x' in mode else 0,
2079 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2080 ))
2081
2082 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2083
2084 def __enter__(self):
2085 exclusive = 'r' not in self.mode
2086 try:
2087 _lock_file(self.f, exclusive, self.block)
2088 self.locked = True
2089 except OSError:
2090 self.f.close()
2091 raise
2092 if 'w' in self.mode:
2093 try:
2094 self.f.truncate()
2095 except OSError as e:
2096 if e.errno not in (
2097 errno.ESPIPE, # Illegal seek - expected for FIFO
2098 errno.EINVAL, # Invalid argument - expected for /dev/null
2099 ):
2100 raise
2101 return self
2102
2103 def unlock(self):
2104 if not self.locked:
2105 return
2106 try:
2107 _unlock_file(self.f)
2108 finally:
2109 self.locked = False
2110
2111 def __exit__(self, *_):
2112 try:
2113 self.unlock()
2114 finally:
2115 self.f.close()
2116
2117 open = __enter__
2118 close = __exit__
2119
2120 def __getattr__(self, attr):
2121 return getattr(self.f, attr)
2122
2123 def __iter__(self):
2124 return iter(self.f)
2125
2126
2127 @functools.cache
2128 def get_filesystem_encoding():
2129 encoding = sys.getfilesystemencoding()
2130 return encoding if encoding is not None else 'utf-8'
2131
2132
2133 def shell_quote(args):
2134 quoted_args = []
2135 encoding = get_filesystem_encoding()
2136 for a in args:
2137 if isinstance(a, bytes):
2138 # We may get a filename encoded with 'encodeFilename'
2139 a = a.decode(encoding)
2140 quoted_args.append(compat_shlex_quote(a))
2141 return ' '.join(quoted_args)
2142
2143
2144 def smuggle_url(url, data):
2145 """ Pass additional data in a URL for internal use. """
2146
2147 url, idata = unsmuggle_url(url, {})
2148 data.update(idata)
2149 sdata = compat_urllib_parse_urlencode(
2150 {'__youtubedl_smuggle': json.dumps(data)})
2151 return url + '#' + sdata
2152
2153
2154 def unsmuggle_url(smug_url, default=None):
2155 if '#__youtubedl_smuggle' not in smug_url:
2156 return smug_url, default
2157 url, _, sdata = smug_url.rpartition('#')
2158 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2159 data = json.loads(jsond)
2160 return url, data
2161
2162
2163 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2164 """ Formats numbers with decimal sufixes like K, M, etc """
2165 num, factor = float_or_none(num), float(factor)
2166 if num is None or num < 0:
2167 return None
2168 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2169 exponent = 0 if num == 0 else max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))  # clamp: log() < 0 when 0 < num < 1
2170 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2171 if factor == 1024:
2172 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2173 converted = num / (factor ** exponent)
2174 return fmt % (converted, suffix)
2175
2176
2177 def format_bytes(bytes):
2178 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2179
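# Illustrative examples:
#   format_decimal_suffix(1500000) == '1M'  # default fmt '%d%s', factor 1000
#   format_bytes(1024 ** 2) == '1.00MiB'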
2180
2181 def lookup_unit_table(unit_table, s):
2182 units_re = '|'.join(re.escape(u) for u in unit_table)
2183 m = re.match(
2184 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2185 if not m:
2186 return None
2187 num_str = m.group('num').replace(',', '.')
2188 mult = unit_table[m.group('unit')]
2189 return int(float(num_str) * mult)
2190
2191
2192 def parse_filesize(s):
2193 if s is None:
2194 return None
2195
2196 # The lower-case forms are of course incorrect and unofficial,
2197 # but we support those too
2198 _UNIT_TABLE = {
2199 'B': 1,
2200 'b': 1,
2201 'bytes': 1,
2202 'KiB': 1024,
2203 'KB': 1000,
2204 'kB': 1024,
2205 'Kb': 1000,
2206 'kb': 1000,
2207 'kilobytes': 1000,
2208 'kibibytes': 1024,
2209 'MiB': 1024 ** 2,
2210 'MB': 1000 ** 2,
2211 'mB': 1024 ** 2,
2212 'Mb': 1000 ** 2,
2213 'mb': 1000 ** 2,
2214 'megabytes': 1000 ** 2,
2215 'mebibytes': 1024 ** 2,
2216 'GiB': 1024 ** 3,
2217 'GB': 1000 ** 3,
2218 'gB': 1024 ** 3,
2219 'Gb': 1000 ** 3,
2220 'gb': 1000 ** 3,
2221 'gigabytes': 1000 ** 3,
2222 'gibibytes': 1024 ** 3,
2223 'TiB': 1024 ** 4,
2224 'TB': 1000 ** 4,
2225 'tB': 1024 ** 4,
2226 'Tb': 1000 ** 4,
2227 'tb': 1000 ** 4,
2228 'terabytes': 1000 ** 4,
2229 'tebibytes': 1024 ** 4,
2230 'PiB': 1024 ** 5,
2231 'PB': 1000 ** 5,
2232 'pB': 1024 ** 5,
2233 'Pb': 1000 ** 5,
2234 'pb': 1000 ** 5,
2235 'petabytes': 1000 ** 5,
2236 'pebibytes': 1024 ** 5,
2237 'EiB': 1024 ** 6,
2238 'EB': 1000 ** 6,
2239 'eB': 1024 ** 6,
2240 'Eb': 1000 ** 6,
2241 'eb': 1000 ** 6,
2242 'exabytes': 1000 ** 6,
2243 'exbibytes': 1024 ** 6,
2244 'ZiB': 1024 ** 7,
2245 'ZB': 1000 ** 7,
2246 'zB': 1024 ** 7,
2247 'Zb': 1000 ** 7,
2248 'zb': 1000 ** 7,
2249 'zettabytes': 1000 ** 7,
2250 'zebibytes': 1024 ** 7,
2251 'YiB': 1024 ** 8,
2252 'YB': 1000 ** 8,
2253 'yB': 1024 ** 8,
2254 'Yb': 1000 ** 8,
2255 'yb': 1000 ** 8,
2256 'yottabytes': 1000 ** 8,
2257 'yobibytes': 1024 ** 8,
2258 }
2259
2260 return lookup_unit_table(_UNIT_TABLE, s)
2261
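# Illustrative examples (',' is accepted as a decimal separator):
#   parse_filesize('1.2MiB') == 1258291
#   parse_filesize('1,24 KB') == 1240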
2262
2263 def parse_count(s):
2264 if s is None:
2265 return None
2266
2267 s = re.sub(r'^[^\d]+\s', '', s).strip()
2268
2269 if re.match(r'^[\d,.]+$', s):
2270 return str_to_int(s)
2271
2272 _UNIT_TABLE = {
2273 'k': 1000,
2274 'K': 1000,
2275 'm': 1000 ** 2,
2276 'M': 1000 ** 2,
2277 'kk': 1000 ** 2,
2278 'KK': 1000 ** 2,
2279 'b': 1000 ** 3,
2280 'B': 1000 ** 3,
2281 }
2282
2283 ret = lookup_unit_table(_UNIT_TABLE, s)
2284 if ret is not None:
2285 return ret
2286
2287 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2288 if mobj:
2289 return str_to_int(mobj.group(1))
2290
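# Illustrative examples:
#   parse_count('1.5M') == 1500000
#   parse_count('1,234 views') == 1234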
2291
2292 def parse_resolution(s, *, lenient=False):
2293 if s is None:
2294 return {}
2295
2296 if lenient:
2297 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2298 else:
2299 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2300 if mobj:
2301 return {
2302 'width': int(mobj.group('w')),
2303 'height': int(mobj.group('h')),
2304 }
2305
2306 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2307 if mobj:
2308 return {'height': int(mobj.group(1))}
2309
2310 mobj = re.search(r'\b([48])[kK]\b', s)
2311 if mobj:
2312 return {'height': int(mobj.group(1)) * 540}
2313
2314 return {}
2315
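# Illustrative examples:
#   parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
#   parse_resolution('720p') == {'height': 720}
#   parse_resolution('4k') == {'height': 2160}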
2316
2317 def parse_bitrate(s):
2318 if not isinstance(s, compat_str):
2319 return
2320 mobj = re.search(r'\b(\d+)\s*kbps', s)
2321 if mobj:
2322 return int(mobj.group(1))
2323
2324
2325 def month_by_name(name, lang='en'):
2326 """ Return the number of a month by (locale-independently) English name """
2327
2328 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2329
2330 try:
2331 return month_names.index(name) + 1
2332 except ValueError:
2333 return None
2334
2335
2336 def month_by_abbreviation(abbrev):
2337 """ Return the number of a month by (locale-independently) English
2338 abbreviations """
2339
2340 try:
2341 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2342 except ValueError:
2343 return None
2344
2345
2346 def fix_xml_ampersands(xml_str):
2347 """Replace all the '&' by '&amp;' in XML"""
2348 return re.sub(
2349 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2350 '&amp;',
2351 xml_str)
2352
2353
2354 def setproctitle(title):
2355 assert isinstance(title, compat_str)
2356
2357 # ctypes in Jython is not complete
2358 # http://bugs.jython.org/issue2148
2359 if sys.platform.startswith('java'):
2360 return
2361
2362 try:
2363 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2364 except OSError:
2365 return
2366 except TypeError:
2367 # LoadLibrary in Windows Python 2.7.13 only expects
2368 # a bytestring, but since unicode_literals turns
2369 # every string into a unicode string, it fails.
2370 return
2371 title_bytes = title.encode()
2372 buf = ctypes.create_string_buffer(len(title_bytes))
2373 buf.value = title_bytes
2374 try:
2375 libc.prctl(15, buf, 0, 0, 0)
2376 except AttributeError:
2377 return # Strange libc, just skip this
2378
2379
2380 def remove_start(s, start):
2381 return s[len(start):] if s is not None and s.startswith(start) else s
2382
2383
2384 def remove_end(s, end):
2385 return s[:-len(end)] if s is not None and s.endswith(end) else s
2386
2387
2388 def remove_quotes(s):
2389 if s is None or len(s) < 2:
2390 return s
2391 for quote in ('"', "'", ):
2392 if s[0] == quote and s[-1] == quote:
2393 return s[1:-1]
2394 return s
2395
2396
2397 def get_domain(url):
2398 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2399 return domain.group('domain') if domain else None
2400
2401
2402 def url_basename(url):
2403 path = compat_urlparse.urlparse(url).path
2404 return path.strip('/').split('/')[-1]
2405
2406
2407 def base_url(url):
2408 return re.match(r'https?://[^?#&]+/', url).group()
2409
2410
2411 def urljoin(base, path):
2412 if isinstance(path, bytes):
2413 path = path.decode()
2414 if not isinstance(path, compat_str) or not path:
2415 return None
2416 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', path):
2417 return path
2418 if isinstance(base, bytes):
2419 base = base.decode()
2420 if not isinstance(base, compat_str) or not re.match(
2421 r'^(?:https?:)?//', base):
2422 return None
2423 return compat_urlparse.urljoin(base, path)
2424
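# Illustrative examples:
#   urljoin('https://example.com/a/', 'b/c') == 'https://example.com/a/b/c'
#   urljoin('https://example.com/a', '//cdn.example.com/x') == '//cdn.example.com/x'
#   urljoin('ftp://example.com/', 'x') is None  # base must be http(s)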
2425
2426 class HEADRequest(compat_urllib_request.Request):
2427 def get_method(self):
2428 return 'HEAD'
2429
2430
2431 class PUTRequest(compat_urllib_request.Request):
2432 def get_method(self):
2433 return 'PUT'
2434
2435
2436 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2437 if get_attr and v is not None:
2438 v = getattr(v, get_attr, None)
2439 try:
2440 return int(v) * invscale // scale
2441 except (ValueError, TypeError, OverflowError):
2442 return default
2443
2444
2445 def str_or_none(v, default=None):
2446 return default if v is None else compat_str(v)
2447
2448
2449 def str_to_int(int_str):
2450 """ A more relaxed version of int_or_none """
2451 if isinstance(int_str, int):
2452 return int_str
2453 elif isinstance(int_str, compat_str):
2454 int_str = re.sub(r'[,\.\+]', '', int_str)
2455 return int_or_none(int_str)
2456
2457
2458 def float_or_none(v, scale=1, invscale=1, default=None):
2459 if v is None:
2460 return default
2461 try:
2462 return float(v) * invscale / scale
2463 except (ValueError, TypeError):
2464 return default
2465
2466
2467 def bool_or_none(v, default=None):
2468 return v if isinstance(v, bool) else default
2469
2470
2471 def strip_or_none(v, default=None):
2472 return v.strip() if isinstance(v, compat_str) else default
2473
2474
2475 def url_or_none(url):
2476 if not url or not isinstance(url, compat_str):
2477 return None
2478 url = url.strip()
2479 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2480
2481
2482 def request_to_url(req):
2483 if isinstance(req, compat_urllib_request.Request):
2484 return req.get_full_url()
2485 else:
2486 return req
2487
2488
2489 def strftime_or_none(timestamp, date_format, default=None):
2490 datetime_object = None
2491 try:
2492 if isinstance(timestamp, (int, float)): # unix timestamp
2493 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2494 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2495 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2496 return datetime_object.strftime(date_format)
2497 except (ValueError, TypeError, AttributeError):
2498 return default
2499
2500
2501 def parse_duration(s):
2502 if not isinstance(s, str):
2503 return None
2504 s = s.strip()
2505 if not s:
2506 return None
2507
2508 days, hours, mins, secs, ms = [None] * 5
2509 m = re.match(r'''(?x)
2510 (?P<before_secs>
2511 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2512 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2513 (?P<ms>[.:][0-9]+)?Z?$
2514 ''', s)
2515 if m:
2516 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2517 else:
2518 m = re.match(
2519 r'''(?ix)(?:P?
2520 (?:
2521 [0-9]+\s*y(?:ears?)?,?\s*
2522 )?
2523 (?:
2524 [0-9]+\s*m(?:onths?)?,?\s*
2525 )?
2526 (?:
2527 [0-9]+\s*w(?:eeks?)?,?\s*
2528 )?
2529 (?:
2530 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2531 )?
2532 T)?
2533 (?:
2534 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2535 )?
2536 (?:
2537 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2538 )?
2539 (?:
2540 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2541 )?Z?$''', s)
2542 if m:
2543 days, hours, mins, secs, ms = m.groups()
2544 else:
2545 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2546 if m:
2547 hours, mins = m.groups()
2548 else:
2549 return None
2550
2551 if ms:
2552 ms = ms.replace(':', '.')
2553 return sum(float(part or 0) * mult for part, mult in (
2554 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2555
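# Illustrative examples (the result is a float number of seconds):
#   parse_duration('1:20:35') == 4835
#   parse_duration('PT2H30M') == 9000
#   parse_duration('3 min') == 180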
2556
2557 def prepend_extension(filename, ext, expected_real_ext=None):
2558 name, real_ext = os.path.splitext(filename)
2559 return (
2560 f'{name}.{ext}{real_ext}'
2561 if not expected_real_ext or real_ext[1:] == expected_real_ext
2562 else f'{filename}.{ext}')
2563
2564
2565 def replace_extension(filename, ext, expected_real_ext=None):
2566 name, real_ext = os.path.splitext(filename)
2567 return '{}.{}'.format(
2568 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2569 ext)
2570
2571
2572 def check_executable(exe, args=[]):
2573 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2574 args can be a list of arguments for a short output (like -version) """
2575 try:
2576 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2577 except OSError:
2578 return False
2579 return exe
2580
2581
2582 def _get_exe_version_output(exe, args, *, to_screen=None):
2583 if to_screen:
2584 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2585 try:
2586 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2587 # SIGTTOU if yt-dlp is run in the background.
2588 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2589 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2590 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2591 except OSError:
2592 return False
2593 return stdout
2594
2595
2596 def detect_exe_version(output, version_re=None, unrecognized='present'):
2597 assert isinstance(output, compat_str)
2598 if version_re is None:
2599 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2600 m = re.search(version_re, output)
2601 if m:
2602 return m.group(1)
2603 else:
2604 return unrecognized
2605
2606
2607 def get_exe_version(exe, args=['--version'],
2608 version_re=None, unrecognized='present'):
2609 """ Returns the version of the specified executable,
2610 or False if the executable is not present """
2611 out = _get_exe_version_output(exe, args)
2612 return detect_exe_version(out, version_re, unrecognized) if out else False
2613
2614
2615 def frange(start=0, stop=None, step=1):
2616 """Float range"""
2617 if stop is None:
2618 start, stop = 0, start
2619 sign = [-1, 1][step > 0] if step else 0
2620 while sign * start < sign * stop:
2621 yield start
2622 start += step
2623
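# Illustrative examples:
#   list(frange(3)) == [0, 1, 2]
#   list(frange(0, 1, 0.25)) == [0, 0.25, 0.5, 0.75]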
2624
2625 class LazyList(collections.abc.Sequence):
2626 """Lazy immutable list from an iterable
2627 Note that slices of a LazyList are lists and not LazyList"""
2628
2629 class IndexError(IndexError):
2630 pass
2631
2632 def __init__(self, iterable, *, reverse=False, _cache=None):
2633 self._iterable = iter(iterable)
2634 self._cache = [] if _cache is None else _cache
2635 self._reversed = reverse
2636
2637 def __iter__(self):
2638 if self._reversed:
2639 # We need to consume the entire iterable to iterate in reverse
2640 yield from self.exhaust()
2641 return
2642 yield from self._cache
2643 for item in self._iterable:
2644 self._cache.append(item)
2645 yield item
2646
2647 def _exhaust(self):
2648 self._cache.extend(self._iterable)
2649 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2650 return self._cache
2651
2652 def exhaust(self):
2653 """Evaluate the entire iterable"""
2654 return self._exhaust()[::-1 if self._reversed else 1]
2655
2656 @staticmethod
2657 def _reverse_index(x):
2658 return None if x is None else -(x + 1)
2659
2660 def __getitem__(self, idx):
2661 if isinstance(idx, slice):
2662 if self._reversed:
2663 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2664 start, stop, step = idx.start, idx.stop, idx.step or 1
2665 elif isinstance(idx, int):
2666 if self._reversed:
2667 idx = self._reverse_index(idx)
2668 start, stop, step = idx, idx, 0
2669 else:
2670 raise TypeError('indices must be integers or slices')
2671 if ((start or 0) < 0 or (stop or 0) < 0
2672 or (start is None and step < 0)
2673 or (stop is None and step > 0)):
2674 # We need to consume the entire iterable to be able to slice from the end
2675 # Obviously, never use this with infinite iterables
2676 self._exhaust()
2677 try:
2678 return self._cache[idx]
2679 except IndexError as e:
2680 raise self.IndexError(e) from e
2681 n = max(start or 0, stop or 0) - len(self._cache) + 1
2682 if n > 0:
2683 self._cache.extend(itertools.islice(self._iterable, n))
2684 try:
2685 return self._cache[idx]
2686 except IndexError as e:
2687 raise self.IndexError(e) from e
2688
2689 def __bool__(self):
2690 try:
2691 self[-1] if self._reversed else self[0]
2692 except self.IndexError:
2693 return False
2694 return True
2695
2696 def __len__(self):
2697 self._exhaust()
2698 return len(self._cache)
2699
2700 def __reversed__(self):
2701 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2702
2703 def __copy__(self):
2704 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2705
2706 def __repr__(self):
2707 # repr and str should mimic a list. So we exhaust the iterable
2708 return repr(self.exhaust())
2709
2710 def __str__(self):
2711 return repr(self.exhaust())
2712
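# Illustrative usage - items are pulled from the iterable only as needed:
#   lst = LazyList(map(str, range(10)))
#   lst[2] == '2'                # consumes items 0-2 only
#   lst[1:4] == ['1', '2', '3']  # slices are plain lists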
2713
2714 class PagedList:
2715
2716 class IndexError(IndexError):
2717 pass
2718
2719 def __len__(self):
2720 # This is only useful for tests
2721 return len(self.getslice())
2722
2723 def __init__(self, pagefunc, pagesize, use_cache=True):
2724 self._pagefunc = pagefunc
2725 self._pagesize = pagesize
2726 self._pagecount = float('inf')
2727 self._use_cache = use_cache
2728 self._cache = {}
2729
2730 def getpage(self, pagenum):
2731 page_results = self._cache.get(pagenum)
2732 if page_results is None:
2733 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2734 if self._use_cache:
2735 self._cache[pagenum] = page_results
2736 return page_results
2737
2738 def getslice(self, start=0, end=None):
2739 return list(self._getslice(start, end))
2740
2741 def _getslice(self, start, end):
2742 raise NotImplementedError('This method must be implemented by subclasses')
2743
2744 def __getitem__(self, idx):
2745 assert self._use_cache, 'Indexing PagedList requires cache'
2746 if not isinstance(idx, int) or idx < 0:
2747 raise TypeError('indices must be non-negative integers')
2748 entries = self.getslice(idx, idx + 1)
2749 if not entries:
2750 raise self.IndexError()
2751 return entries[0]
2752
2753
2754 class OnDemandPagedList(PagedList):
2755 """Download pages until a page with less than maximum results"""
2756
2757 def _getslice(self, start, end):
2758 for pagenum in itertools.count(start // self._pagesize):
2759 firstid = pagenum * self._pagesize
2760 nextfirstid = pagenum * self._pagesize + self._pagesize
2761 if start >= nextfirstid:
2762 continue
2763
2764 startv = (
2765 start % self._pagesize
2766 if firstid <= start < nextfirstid
2767 else 0)
2768 endv = (
2769 ((end - 1) % self._pagesize) + 1
2770 if (end is not None and firstid <= end <= nextfirstid)
2771 else None)
2772
2773 try:
2774 page_results = self.getpage(pagenum)
2775 except Exception:
2776 self._pagecount = pagenum - 1
2777 raise
2778 if startv != 0 or endv is not None:
2779 page_results = page_results[startv:endv]
2780 yield from page_results
2781
2782 # A little optimization - if the current page is not "full", i.e. does
2783 # not contain page_size videos, then we can assume that this page
2784 # is the last one - there are no more ids on further pages,
2785 # so there is no need to query again.
2786 if len(page_results) + startv < self._pagesize:
2787 break
2788
2789 # If we got the whole page, but the next page is not interesting,
2790 # break out early as well
2791 if end == nextfirstid:
2792 break
2793
2794
2795 class InAdvancePagedList(PagedList):
2796 """PagedList with total number of pages known in advance"""
2797
2798 def __init__(self, pagefunc, pagecount, pagesize):
2799 PagedList.__init__(self, pagefunc, pagesize, True)
2800 self._pagecount = pagecount
2801
2802 def _getslice(self, start, end):
2803 start_page = start // self._pagesize
2804 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2805 skip_elems = start - start_page * self._pagesize
2806 only_more = None if end is None else end - start
2807 for pagenum in range(start_page, end_page):
2808 page_results = self.getpage(pagenum)
2809 if skip_elems:
2810 page_results = page_results[skip_elems:]
2811 skip_elems = None
2812 if only_more is not None:
2813 if len(page_results) < only_more:
2814 only_more -= len(page_results)
2815 else:
2816 yield from page_results[:only_more]
2817 break
2818 yield from page_results
2819
2820
2821 class PlaylistEntries:
2822 MissingEntry = object()
2823 is_exhausted = False
2824
2825 def __init__(self, ydl, info_dict):
2826 self.ydl = ydl
2827
2828 # _entries must be assigned now since infodict can change during iteration
2829 entries = info_dict.get('entries')
2830 if entries is None:
2831 raise EntryNotInPlaylist('There are no entries')
2832 elif isinstance(entries, list):
2833 self.is_exhausted = True
2834
2835 requested_entries = info_dict.get('requested_entries')
2836 self.is_incomplete = bool(requested_entries)
2837 if self.is_incomplete:
2838 assert self.is_exhausted
2839 self._entries = [self.MissingEntry] * max(requested_entries)
2840 for i, entry in zip(requested_entries, entries):
2841 self._entries[i - 1] = entry
2842 elif isinstance(entries, (list, PagedList, LazyList)):
2843 self._entries = entries
2844 else:
2845 self._entries = LazyList(entries)
2846
2847 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2848 (?P<start>[+-]?\d+)?
2849 (?P<range>[:-]
2850 (?P<end>[+-]?\d+|inf(?:inite)?)?
2851 (?::(?P<step>[+-]?\d+))?
2852 )?''')
2853
2854 @classmethod
2855 def parse_playlist_items(cls, string):
2856 for segment in string.split(','):
2857 if not segment:
2858 raise ValueError('There are two or more consecutive commas')
2859 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2860 if not mobj:
2861 raise ValueError(f'{segment!r} is not a valid specification')
2862 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2863 if int_or_none(step) == 0:
2864 raise ValueError(f'Step in {segment!r} cannot be zero')
2865 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2866
2867 def get_requested_items(self):
2868 playlist_items = self.ydl.params.get('playlist_items')
2869 playlist_start = self.ydl.params.get('playliststart', 1)
2870 playlist_end = self.ydl.params.get('playlistend')
2871 # For backwards compatibility, interpret -1 as whole list
2872 if playlist_end in (-1, None):
2873 playlist_end = ''
2874 if not playlist_items:
2875 playlist_items = f'{playlist_start}:{playlist_end}'
2876 elif playlist_start != 1 or playlist_end:
2877 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2878
2879 for index in self.parse_playlist_items(playlist_items):
2880 for i, entry in self[index]:
2881 yield i, entry
2882 try:
2883 # TODO: Add auto-generated fields
2884 self.ydl._match_entry(entry, incomplete=True, silent=True)
2885 except (ExistingVideoReached, RejectedVideoReached):
2886 return
2887
2888 def get_full_count(self):
2889 if self.is_exhausted and not self.is_incomplete:
2890 return len(self)
2891 elif isinstance(self._entries, InAdvancePagedList):
2892 if self._entries._pagesize == 1:
2893 return self._entries._pagecount
2894
2895 @functools.cached_property
2896 def _getter(self):
2897 if isinstance(self._entries, list):
2898 def get_entry(i):
2899 try:
2900 entry = self._entries[i]
2901 except IndexError:
2902 entry = self.MissingEntry
2903 if not self.is_incomplete:
2904 raise self.IndexError()
2905 if entry is self.MissingEntry:
2906 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2907 return entry
2908 else:
2909 def get_entry(i):
2910 try:
2911 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2912 except (LazyList.IndexError, PagedList.IndexError):
2913 raise self.IndexError()
2914 return get_entry
2915
2916 def __getitem__(self, idx):
2917 if isinstance(idx, int):
2918 idx = slice(idx, idx)
2919
2920 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2921 step = 1 if idx.step is None else idx.step
2922 if idx.start is None:
2923 start = 0 if step > 0 else len(self) - 1
2924 else:
2925 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2926
2927 # NB: Do not call len(self) when idx == [:]
2928 if idx.stop is None:
2929 stop = 0 if step < 0 else float('inf')
2930 else:
2931 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2932 stop += [-1, 1][step > 0]
2933
2934 for i in frange(start, stop, step):
2935 if i < 0:
2936 continue
2937 try:
2938 entry = self._getter(i)
2939 except self.IndexError:
2940 self.is_exhausted = True
2941 if step > 0:
2942 break
2943 continue
2944 yield i + 1, entry
2945
2946 def __len__(self):
2947 return len(tuple(self[:]))
2948
2949 class IndexError(IndexError):
2950 pass
2951
2952
2953 def uppercase_escape(s):
2954 unicode_escape = codecs.getdecoder('unicode_escape')
2955 return re.sub(
2956 r'\\U[0-9a-fA-F]{8}',
2957 lambda m: unicode_escape(m.group(0))[0],
2958 s)
2959
2960
2961 def lowercase_escape(s):
2962 unicode_escape = codecs.getdecoder('unicode_escape')
2963 return re.sub(
2964 r'\\u[0-9a-fA-F]{4}',
2965 lambda m: unicode_escape(m.group(0))[0],
2966 s)
2967
2968
2969 def escape_rfc3986(s):
2970 """Escape non-ASCII characters as suggested by RFC 3986"""
2971 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2972
2973
2974 def escape_url(url):
2975 """Escape URL as suggested by RFC 3986"""
2976 url_parsed = compat_urllib_parse_urlparse(url)
2977 return url_parsed._replace(
2978 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2979 path=escape_rfc3986(url_parsed.path),
2980 params=escape_rfc3986(url_parsed.params),
2981 query=escape_rfc3986(url_parsed.query),
2982 fragment=escape_rfc3986(url_parsed.fragment)
2983 ).geturl()
2984
2985
2986 def parse_qs(url):
2987 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2988
2989
2990 def read_batch_urls(batch_fd):
2991 def fixup(url):
2992 if not isinstance(url, compat_str):
2993 url = url.decode('utf-8', 'replace')
2994 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2995 for bom in BOM_UTF8:
2996 if url.startswith(bom):
2997 url = url[len(bom):]
2998 url = url.lstrip()
2999 if not url or url.startswith(('#', ';', ']')):
3000 return False
3001 # "#" cannot be stripped out since it is part of the URI
3002 # However, it can be safely stripped out if following a whitespace
3003 return re.split(r'\s#', url, 1)[0].rstrip()
3004
3005 with contextlib.closing(batch_fd) as fd:
3006 return [url for url in map(fixup, fd) if url]
3007
3008
3009 def urlencode_postdata(*args, **kargs):
3010 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
3011
3012
3013 def update_url_query(url, query):
3014 if not query:
3015 return url
3016 parsed_url = compat_urlparse.urlparse(url)
3017 qs = compat_parse_qs(parsed_url.query)
3018 qs.update(query)
3019 return compat_urlparse.urlunparse(parsed_url._replace(
3020 query=compat_urllib_parse_urlencode(qs, True)))
3021
3022
3023 def update_Request(req, url=None, data=None, headers={}, query={}):
3024 req_headers = req.headers.copy()
3025 req_headers.update(headers)
3026 req_data = data or req.data
3027 req_url = update_url_query(url or req.get_full_url(), query)
3028 req_get_method = req.get_method()
3029 if req_get_method == 'HEAD':
3030 req_type = HEADRequest
3031 elif req_get_method == 'PUT':
3032 req_type = PUTRequest
3033 else:
3034 req_type = compat_urllib_request.Request
3035 new_req = req_type(
3036 req_url, data=req_data, headers=req_headers,
3037 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3038 if hasattr(req, 'timeout'):
3039 new_req.timeout = req.timeout
3040 return new_req
3041
3042
3043 def _multipart_encode_impl(data, boundary):
3044 content_type = 'multipart/form-data; boundary=%s' % boundary
3045
3046 out = b''
3047 for k, v in data.items():
3048 out += b'--' + boundary.encode('ascii') + b'\r\n'
3049 if isinstance(k, compat_str):
3050 k = k.encode()
3051 if isinstance(v, compat_str):
3052 v = v.encode()
3053 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3054 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3055 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3056 if boundary.encode('ascii') in content:
3057 raise ValueError('Boundary overlaps with data')
3058 out += content
3059
3060 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3061
3062 return out, content_type
3063
3064
3065 def multipart_encode(data, boundary=None):
3066 '''
3067 Encode a dict to RFC 7578-compliant form-data
3068
3069 data:
3070 A dict where keys and values can be either Unicode or bytes-like
3071 objects.
3072 boundary:
3073 If specified, it must be a Unicode object and is used as the boundary.
3074 Otherwise a random boundary is generated.
3075
3076 Reference: https://tools.ietf.org/html/rfc7578
3077 '''
3078 has_specified_boundary = boundary is not None
3079
3080 while True:
3081 if boundary is None:
3082 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3083
3084 try:
3085 out, content_type = _multipart_encode_impl(data, boundary)
3086 break
3087 except ValueError:
3088 if has_specified_boundary:
3089 raise
3090 boundary = None
3091
3092 return out, content_type
3093
3094
3095 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3096 for val in map(d.get, variadic(key_or_keys)):
3097 if val is not None and (val or not skip_false_values):
3098 return val
3099 return default
3100
3101
3102 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3103 for f in funcs:
3104 try:
3105 val = f(*args, **kwargs)
3106 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3107 pass
3108 else:
3109 if expected_type is None or isinstance(val, expected_type):
3110 return val
3111
3112
3113 def try_get(src, getter, expected_type=None):
3114 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3115
3116
3117 def filter_dict(dct, cndn=lambda _, v: v is not None):
3118 return {k: v for k, v in dct.items() if cndn(k, v)}
3119
3120
3121 def merge_dicts(*dicts):
3122 merged = {}
3123 for a_dict in dicts:
3124 for k, v in a_dict.items():
3125 if (v is not None and k not in merged
3126 or isinstance(v, str) and merged[k] == ''):
3127 merged[k] = v
3128 return merged
3129
3130
3131 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3132 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3133
3134
3135 US_RATINGS = {
3136 'G': 0,
3137 'PG': 10,
3138 'PG-13': 13,
3139 'R': 16,
3140 'NC': 18,
3141 }
3142
3143
3144 TV_PARENTAL_GUIDELINES = {
3145 'TV-Y': 0,
3146 'TV-Y7': 7,
3147 'TV-G': 0,
3148 'TV-PG': 0,
3149 'TV-14': 14,
3150 'TV-MA': 17,
3151 }
3152
3153
3154 def parse_age_limit(s):
3155 # isinstance(False, int) is True. So type() must be used instead
3156 if type(s) is int: # noqa: E721
3157 return s if 0 <= s <= 21 else None
3158 elif not isinstance(s, str):
3159 return None
3160 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3161 if m:
3162 return int(m.group('age'))
3163 s = s.upper()
3164 if s in US_RATINGS:
3165 return US_RATINGS[s]
3166 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3167 if m:
3168 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3169 return None
3170
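# Illustrative examples:
#   parse_age_limit('PG-13') == 13
#   parse_age_limit('TV-MA') == 17
#   parse_age_limit(False) is None  # bools are deliberately not treated as ints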
3171
3172 def strip_jsonp(code):
3173 return re.sub(
3174 r'''(?sx)^
3175 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3176 (?:\s*&&\s*(?P=func_name))?
3177 \s*\(\s*(?P<callback_data>.*)\);?
3178 \s*?(?://[^\n]*)*$''',
3179 r'\g<callback_data>', code)
3180
3181
3182 def js_to_json(code, vars={}):
3183 # vars is a dict of var, val pairs to substitute
3184 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3185 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3186 INTEGER_TABLE = (
3187 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3188 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3189 )
3190
3191 def fix_kv(m):
3192 v = m.group(0)
3193 if v in ('true', 'false', 'null'):
3194 return v
3195 elif v in ('undefined', 'void 0'):
3196 return 'null'
3197 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3198 return ""
3199
3200 if v[0] in ("'", '"'):
3201 v = re.sub(r'(?s)\\.|"', lambda m: {
3202 '"': '\\"',
3203 "\\'": "'",
3204 '\\\n': '',
3205 '\\x': '\\u00',
3206 }.get(m.group(0), m.group(0)), v[1:-1])
3207 else:
3208 for regex, base in INTEGER_TABLE:
3209 im = re.match(regex, v)
3210 if im:
3211 i = int(im.group(1), base)
3212 return '"%d":' % i if v.endswith(':') else '%d' % i
3213
3214 if v in vars:
3215 return vars[v]
3216
3217 return '"%s"' % v
3218
3219 def create_map(mobj):
3220 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3221
3222 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3223 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3224
3225 return re.sub(r'''(?sx)
3226 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3227 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3228 {comment}|,(?={skip}[\]}}])|
3229 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3230 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3231 [0-9]+(?={skip}:)|
3232 !+
3233 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3234
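# Illustrative example:
#   js_to_json("{abc: true, 'def': 0x10}") == '{"abc": true, "def": 16}'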
3235
3236 def qualities(quality_ids):
3237 """ Get a numeric quality value out of a list of possible values """
3238 def q(qid):
3239 try:
3240 return quality_ids.index(qid)
3241 except ValueError:
3242 return -1
3243 return q
3244
3245
3246 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3247
3248
3249 DEFAULT_OUTTMPL = {
3250 'default': '%(title)s [%(id)s].%(ext)s',
3251 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3252 }
3253 OUTTMPL_TYPES = {
3254 'chapter': None,
3255 'subtitle': None,
3256 'thumbnail': None,
3257 'description': 'description',
3258 'annotation': 'annotations.xml',
3259 'infojson': 'info.json',
3260 'link': None,
3261 'pl_video': None,
3262 'pl_thumbnail': None,
3263 'pl_description': 'description',
3264 'pl_infojson': 'info.json',
3265 }
3266
3267 # As of [1] format syntax is:
3268 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3269 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3270 STR_FORMAT_RE_TMPL = r'''(?x)
3271 (?<!%)(?P<prefix>(?:%%)*)
3272 %
3273 (?P<has_key>\((?P<key>{0})\))?
3274 (?P<format>
3275 (?P<conversion>[#0\-+ ]+)?
3276 (?P<min_width>\d+)?
3277 (?P<precision>\.\d+)?
3278 (?P<len_mod>[hlL])? # unused in python
3279 {1} # conversion type
3280 )
3281 '''
3282
3283
3284 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3285
3286
3287 def limit_length(s, length):
3288 """ Add ellipses to overly long strings """
3289 if s is None:
3290 return None
3291 ELLIPSES = '...'
3292 if len(s) > length:
3293 return s[:length - len(ELLIPSES)] + ELLIPSES
3294 return s
3295
3296
3297 def version_tuple(v):
3298 return tuple(int(e) for e in re.split(r'[-.]', v))
3299
3300
3301 def is_outdated_version(version, limit, assume_new=True):
3302 if not version:
3303 return not assume_new
3304 try:
3305 return version_tuple(version) < version_tuple(limit)
3306 except ValueError:
3307 return not assume_new
3308
3309
3310 def ytdl_is_updateable():
3311 """ Returns if yt-dlp can be updated with -U """
3312
3313 from .update import is_non_updateable
3314
3315 return not is_non_updateable()
3316
3317
3318 def args_to_str(args):
3319 # Get a short string representation for a subprocess command
3320 return ' '.join(compat_shlex_quote(a) for a in args)
3321
3322
3323 def error_to_compat_str(err):
3324 return str(err)
3325
3326
3327 def error_to_str(err):
3328 return f'{type(err).__name__}: {err}'
3329
3330
3331 def mimetype2ext(mt):
3332 if mt is None:
3333 return None
3334
3335 mt, _, params = mt.partition(';')
3336 mt = mt.strip()
3337
3338 FULL_MAP = {
3339 'audio/mp4': 'm4a',
3340 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
3341 # since it's the most popular one
3342 'audio/mpeg': 'mp3',
3343 'audio/x-wav': 'wav',
3344 'audio/wav': 'wav',
3345 'audio/wave': 'wav',
3346 }
3347
3348 ext = FULL_MAP.get(mt)
3349 if ext is not None:
3350 return ext
3351
3352 SUBTYPE_MAP = {
3353 '3gpp': '3gp',
3354 'smptett+xml': 'tt',
3355 'ttaf+xml': 'dfxp',
3356 'ttml+xml': 'ttml',
3357 'x-flv': 'flv',
3358 'x-mp4-fragmented': 'mp4',
3359 'x-ms-sami': 'sami',
3360 'x-ms-wmv': 'wmv',
3361 'mpegurl': 'm3u8',
3362 'x-mpegurl': 'm3u8',
3363 'vnd.apple.mpegurl': 'm3u8',
3364 'dash+xml': 'mpd',
3365 'f4m+xml': 'f4m',
3366 'hds+xml': 'f4m',
3367 'vnd.ms-sstr+xml': 'ism',
3368 'quicktime': 'mov',
3369 'mp2t': 'ts',
3370 'x-wav': 'wav',
3371 'filmstrip+json': 'fs',
3372 'svg+xml': 'svg',
3373 }
3374
3375 _, _, subtype = mt.rpartition('/')
3376 ext = SUBTYPE_MAP.get(subtype.lower())
3377 if ext is not None:
3378 return ext
3379
3380 SUFFIX_MAP = {
3381 'json': 'json',
3382 'xml': 'xml',
3383 'zip': 'zip',
3384 'gzip': 'gz',
3385 }
3386
3387 _, _, suffix = subtype.partition('+')
3388 ext = SUFFIX_MAP.get(suffix)
3389 if ext is not None:
3390 return ext
3391
3392 return subtype.replace('+', '.')
3393
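# Illustrative examples:
#   mimetype2ext('audio/mp4') == 'm4a'
#   mimetype2ext('application/x-mpegURL') == 'm3u8'
#   mimetype2ext('video/mp4; codecs="avc1.42E01E"') == 'mp4'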
3394
3395 def ext2mimetype(ext_or_url):
3396 if not ext_or_url:
3397 return None
3398 if '.' not in ext_or_url:
3399 ext_or_url = f'file.{ext_or_url}'
3400 return mimetypes.guess_type(ext_or_url)[0]
3401
3402
3403 def parse_codecs(codecs_str):
3404 # http://tools.ietf.org/html/rfc6381
3405 if not codecs_str:
3406 return {}
3407 split_codecs = list(filter(None, map(
3408 str.strip, codecs_str.strip().strip(',').split(','))))
3409 vcodec, acodec, scodec, hdr = None, None, None, None
3410 for full_codec in split_codecs:
3411 parts = full_codec.split('.')
3412 codec = parts[0].replace('0', '')
3413 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3414 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3415 if not vcodec:
3416 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3417 if codec in ('dvh1', 'dvhe'):
3418 hdr = 'DV'
3419 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3420 hdr = 'HDR10'
3421 elif full_codec.replace('0', '').startswith('vp9.2'):
3422 hdr = 'HDR10'
3423 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3424 if not acodec:
3425 acodec = full_codec
3426 elif codec in ('stpp', 'wvtt',):
3427 if not scodec:
3428 scodec = full_codec
3429 else:
3430 write_string(f'WARNING: Unknown codec {full_codec}\n')
3431 if vcodec or acodec or scodec:
3432 return {
3433 'vcodec': vcodec or 'none',
3434 'acodec': acodec or 'none',
3435 'dynamic_range': hdr,
3436 **({'scodec': scodec} if scodec is not None else {}),
3437 }
3438 elif len(split_codecs) == 2:
3439 return {
3440 'vcodec': split_codecs[0],
3441 'acodec': split_codecs[1],
3442 }
3443 return {}
3444
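# Illustrative example:
#   parse_codecs('avc1.64001f, mp4a.40.2') == {
#       'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}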
3445
3446 def urlhandle_detect_ext(url_handle):
3447 getheader = url_handle.headers.get
3448
3449 cd = getheader('Content-Disposition')
3450 if cd:
3451 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3452 if m:
3453 e = determine_ext(m.group('filename'), default_ext=None)
3454 if e:
3455 return e
3456
3457 return mimetype2ext(getheader('Content-Type'))
3458
3459
3460 def encode_data_uri(data, mime_type):
3461 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3462
3463
3464 def age_restricted(content_limit, age_limit):
3465 """ Returns True iff the content should be blocked """
3466
3467 if age_limit is None: # No limit set
3468 return False
3469 if content_limit is None:
3470 return False # Content available for everyone
3471 return age_limit < content_limit
3472
3473
3474 def is_html(first_bytes):
3475 """ Detect whether a file contains HTML by examining its first bytes. """
3476
3477 BOMS = [
3478 (b'\xef\xbb\xbf', 'utf-8'),
3479 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3480 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3481 (b'\xff\xfe', 'utf-16-le'),
3482 (b'\xfe\xff', 'utf-16-be'),
3483 ]
3484
3485 encoding = 'utf-8'
3486 for bom, enc in BOMS:
3487 while first_bytes.startswith(bom):
3488 encoding, first_bytes = enc, first_bytes[len(bom):]
3489
3490 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3491
3492
3493 def determine_protocol(info_dict):
3494 protocol = info_dict.get('protocol')
3495 if protocol is not None:
3496 return protocol
3497
3498 url = sanitize_url(info_dict['url'])
3499 if url.startswith('rtmp'):
3500 return 'rtmp'
3501 elif url.startswith('mms'):
3502 return 'mms'
3503 elif url.startswith('rtsp'):
3504 return 'rtsp'
3505
3506 ext = determine_ext(url)
3507 if ext == 'm3u8':
3508 return 'm3u8'
3509 elif ext == 'f4m':
3510 return 'f4m'
3511
3512 return compat_urllib_parse_urlparse(url).scheme
3513
3514
3515 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3516 """ Render a list of rows, each as a list of values.
3517 Text after a \t will be right aligned """
3518 def width(string):
3519 return len(remove_terminal_sequences(string).replace('\t', ''))
3520
3521 def get_max_lens(table):
3522 return [max(width(str(v)) for v in col) for col in zip(*table)]
3523
3524 def filter_using_list(row, filterArray):
3525 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3526
3527 max_lens = get_max_lens(data) if hide_empty else []
3528 header_row = filter_using_list(header_row, max_lens)
3529 data = [filter_using_list(row, max_lens) for row in data]
3530
3531 table = [header_row] + data
3532 max_lens = get_max_lens(table)
3533 extra_gap += 1
3534 if delim:
3535 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3536 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3537 for row in table:
3538 for pos, text in enumerate(map(str, row)):
3539 if '\t' in text:
3540 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3541 else:
3542 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3543 ret = '\n'.join(''.join(row).rstrip() for row in table)
3544 return ret
3545
3546
3547 def _match_one(filter_part, dct, incomplete):
3548 # TODO: Generalize code with YoutubeDL._build_format_filter
3549 STRING_OPERATORS = {
3550 '*=': operator.contains,
3551 '^=': lambda attr, value: attr.startswith(value),
3552 '$=': lambda attr, value: attr.endswith(value),
3553 '~=': lambda attr, value: re.search(value, attr),
3554 }
3555 COMPARISON_OPERATORS = {
3556 **STRING_OPERATORS,
3557 '<=': operator.le, # "<=" must be defined above "<"
3558 '<': operator.lt,
3559 '>=': operator.ge,
3560 '>': operator.gt,
3561 '=': operator.eq,
3562 }
3563
3564 if isinstance(incomplete, bool):
3565 is_incomplete = lambda _: incomplete
3566 else:
3567 is_incomplete = lambda k: k in incomplete
3568
3569 operator_rex = re.compile(r'''(?x)
3570 (?P<key>[a-z_]+)
3571 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3572 (?:
3573 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3574 (?P<strval>.+?)
3575 )
3576 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3577 m = operator_rex.fullmatch(filter_part.strip())
3578 if m:
3579 m = m.groupdict()
3580 unnegated_op = COMPARISON_OPERATORS[m['op']]
3581 if m['negation']:
3582 op = lambda attr, value: not unnegated_op(attr, value)
3583 else:
3584 op = unnegated_op
3585 comparison_value = m['quotedstrval'] or m['strval']
3586 if m['quote']:
3587 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3588 actual_value = dct.get(m['key'])
3589 numeric_comparison = None
3590 if isinstance(actual_value, (int, float)):
3591 # If the original field is a string and the matching comparison value is
3592 # a number, we should respect the origin of the original field
3593 # and process the comparison value as a string (see
3594 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3595 try:
3596 numeric_comparison = int(comparison_value)
3597 except ValueError:
3598 numeric_comparison = parse_filesize(comparison_value)
3599 if numeric_comparison is None:
3600 numeric_comparison = parse_filesize(f'{comparison_value}B')
3601 if numeric_comparison is None:
3602 numeric_comparison = parse_duration(comparison_value)
3603 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3604 raise ValueError('Operator %s only supports string values!' % m['op'])
3605 if actual_value is None:
3606 return is_incomplete(m['key']) or m['none_inclusive']
3607 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3608
3609 UNARY_OPERATORS = {
3610 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3611 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3612 }
3613 operator_rex = re.compile(r'''(?x)
3614 (?P<op>%s)\s*(?P<key>[a-z_]+)
3615 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3616 m = operator_rex.fullmatch(filter_part.strip())
3617 if m:
3618 op = UNARY_OPERATORS[m.group('op')]
3619 actual_value = dct.get(m.group('key'))
3620 if is_incomplete(m.group('key')) and actual_value is None:
3621 return True
3622 return op(actual_value)
3623
3624 raise ValueError('Invalid filter part %r' % filter_part)
3625
3626
3627 def match_str(filter_str, dct, incomplete=False):
3628 """ Filter a dictionary with a simple string syntax.
3629 @returns Whether the filter passes
3630 @param incomplete Set of keys that are expected to be missing from dct.
3631 Can be True/False to indicate that all/none of the keys may be missing.
3632 All conditions on incomplete keys pass if the key is missing.
3633 """
3634 return all(
3635 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3636 for filter_part in re.split(r'(?<!\\)&', filter_str))
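
# Illustrative usage sketch (hand-computed, untested) of the filter syntax
# consumed by match_str()/_match_one(): '&' separates conditions, '!' negates
# an operator and a bare key tests for presence.
#   >>> match_str('like_count > 100 & description', {'like_count': 190, 'description': 'foo'})
#   True
#   >>> match_str('like_count > 100', {'like_count': 50})
#   False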
3637
3638
3639 def match_filter_func(filters):
3640 if not filters:
3641 return None
3642 filters = set(variadic(filters))
3643
3644 interactive = '-' in filters
3645 if interactive:
3646 filters.remove('-')
3647
3648 def _match_func(info_dict, incomplete=False):
3649 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3650 return NO_DEFAULT if interactive and not incomplete else None
3651 else:
3652 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3653 filter_str = ') | ('.join(map(str.strip, filters))
3654 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3655 return _match_func
3656
3657
3658 def download_range_func(chapters, ranges):
3659 def inner(info_dict, ydl):
3660 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3661 else 'Cannot match chapters since chapter information is unavailable')
3662 for regex in chapters or []:
3663 for i, chapter in enumerate(info_dict.get('chapters') or []):
3664 if re.search(regex, chapter['title']):
3665 warning = None
3666 yield {**chapter, 'index': i}
3667 if chapters and warning:
3668 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3669
3670 yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
3671
3672 return inner
3673
3674
3675 def parse_dfxp_time_expr(time_expr):
3676 if not time_expr:
3677 return
3678
3679 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3680 if mobj:
3681 return float(mobj.group('time_offset'))
3682
3683 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3684 if mobj:
3685 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
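
# Illustrative usage sketch (hand-computed, untested): both bare offsets and
# clock values are accepted, and ':' as a fraction separator is tolerated.
#   >>> parse_dfxp_time_expr('5.1s')
#   5.1
#   >>> parse_dfxp_time_expr('00:03:15.3')
#   195.3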
3686
3687
3688 def srt_subtitles_timecode(seconds):
3689 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3690
3691
3692 def ass_subtitles_timecode(seconds):
3693 time = timetuple_from_msec(seconds * 1000)
3694 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
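
# Illustrative usage sketch (hand-computed, untested): SRT uses a comma and
# millisecond precision, ASS uses a dot and centisecond precision.
#   >>> srt_subtitles_timecode(61.5)
#   '00:01:01,500'
#   >>> ass_subtitles_timecode(61.5)
#   '0:01:01.50'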
3695
3696
3697 def dfxp2srt(dfxp_data):
3698 '''
3699 @param dfxp_data A bytes-like object containing DFXP data
3700 @returns A unicode object containing converted SRT data
3701 '''
3702 LEGACY_NAMESPACES = (
3703 (b'http://www.w3.org/ns/ttml', [
3704 b'http://www.w3.org/2004/11/ttaf1',
3705 b'http://www.w3.org/2006/04/ttaf1',
3706 b'http://www.w3.org/2006/10/ttaf1',
3707 ]),
3708 (b'http://www.w3.org/ns/ttml#styling', [
3709 b'http://www.w3.org/ns/ttml#style',
3710 ]),
3711 )
3712
3713 SUPPORTED_STYLING = [
3714 'color',
3715 'fontFamily',
3716 'fontSize',
3717 'fontStyle',
3718 'fontWeight',
3719 'textDecoration'
3720 ]
3721
3722 _x = functools.partial(xpath_with_ns, ns_map={
3723 'xml': 'http://www.w3.org/XML/1998/namespace',
3724 'ttml': 'http://www.w3.org/ns/ttml',
3725 'tts': 'http://www.w3.org/ns/ttml#styling',
3726 })
3727
3728 styles = {}
3729 default_style = {}
3730
3731 class TTMLPElementParser:
3732 _out = ''
3733 _unclosed_elements = []
3734 _applied_styles = []
3735
3736 def start(self, tag, attrib):
3737 if tag in (_x('ttml:br'), 'br'):
3738 self._out += '\n'
3739 else:
3740 unclosed_elements = []
3741 style = {}
3742 element_style_id = attrib.get('style')
3743 if default_style:
3744 style.update(default_style)
3745 if element_style_id:
3746 style.update(styles.get(element_style_id, {}))
3747 for prop in SUPPORTED_STYLING:
3748 prop_val = attrib.get(_x('tts:' + prop))
3749 if prop_val:
3750 style[prop] = prop_val
3751 if style:
3752 font = ''
3753 for k, v in sorted(style.items()):
3754 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3755 continue
3756 if k == 'color':
3757 font += ' color="%s"' % v
3758 elif k == 'fontSize':
3759 font += ' size="%s"' % v
3760 elif k == 'fontFamily':
3761 font += ' face="%s"' % v
3762 elif k == 'fontWeight' and v == 'bold':
3763 self._out += '<b>'
3764 unclosed_elements.append('b')
3765 elif k == 'fontStyle' and v == 'italic':
3766 self._out += '<i>'
3767 unclosed_elements.append('i')
3768 elif k == 'textDecoration' and v == 'underline':
3769 self._out += '<u>'
3770 unclosed_elements.append('u')
3771 if font:
3772 self._out += '<font' + font + '>'
3773 unclosed_elements.append('font')
3774 applied_style = {}
3775 if self._applied_styles:
3776 applied_style.update(self._applied_styles[-1])
3777 applied_style.update(style)
3778 self._applied_styles.append(applied_style)
3779 self._unclosed_elements.append(unclosed_elements)
3780
3781 def end(self, tag):
3782 if tag not in (_x('ttml:br'), 'br'):
3783 unclosed_elements = self._unclosed_elements.pop()
3784 for element in reversed(unclosed_elements):
3785 self._out += '</%s>' % element
3786 if unclosed_elements and self._applied_styles:
3787 self._applied_styles.pop()
3788
3789 def data(self, data):
3790 self._out += data
3791
3792 def close(self):
3793 return self._out.strip()
3794
3795 def parse_node(node):
3796 target = TTMLPElementParser()
3797 parser = xml.etree.ElementTree.XMLParser(target=target)
3798 parser.feed(xml.etree.ElementTree.tostring(node))
3799 return parser.close()
3800
3801 for k, v in LEGACY_NAMESPACES:
3802 for ns in v:
3803 dfxp_data = dfxp_data.replace(ns, k)
3804
3805 dfxp = compat_etree_fromstring(dfxp_data)
3806 out = []
3807 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3808
3809 if not paras:
3810 raise ValueError('Invalid dfxp/TTML subtitle')
3811
3812 repeat = False
3813 while True:
3814 for style in dfxp.findall(_x('.//ttml:style')):
3815 style_id = style.get('id') or style.get(_x('xml:id'))
3816 if not style_id:
3817 continue
3818 parent_style_id = style.get('style')
3819 if parent_style_id:
3820 if parent_style_id not in styles:
3821 repeat = True
3822 continue
3823 styles[style_id] = styles[parent_style_id].copy()
3824 for prop in SUPPORTED_STYLING:
3825 prop_val = style.get(_x('tts:' + prop))
3826 if prop_val:
3827 styles.setdefault(style_id, {})[prop] = prop_val
3828 if repeat:
3829 repeat = False
3830 else:
3831 break
3832
3833 for p in ('body', 'div'):
3834 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3835 if ele is None:
3836 continue
3837 style = styles.get(ele.get('style'))
3838 if not style:
3839 continue
3840 default_style.update(style)
3841
3842 for para, index in zip(paras, itertools.count(1)):
3843 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3844 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3845 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3846 if begin_time is None:
3847 continue
3848 if not end_time:
3849 if not dur:
3850 continue
3851 end_time = begin_time + dur
3852 out.append('%d\n%s --> %s\n%s\n\n' % (
3853 index,
3854 srt_subtitles_timecode(begin_time),
3855 srt_subtitles_timecode(end_time),
3856 parse_node(para)))
3857
3858 return ''.join(out)
3859
3860
3861 def cli_option(params, command_option, param, separator=None):
3862 param = params.get(param)
3863 return ([] if param is None
3864 else [command_option, str(param)] if separator is None
3865 else [f'{command_option}{separator}{param}'])
3866
3867
3868 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3869 param = params.get(param)
3870 assert param in (True, False, None)
3871 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3872
3873
3874 def cli_valueless_option(params, command_option, param, expected_value=True):
3875 return [command_option] if params.get(param) == expected_value else []
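
# Illustrative usage sketch (hand-computed, untested) for the cli_* helpers,
# which translate (params, option) pairs into external-command arguments.
# Note that cli_bool_option looks the boolean up in a {True: ..., False: ...}
# dict via cli_option.
#   >>> cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', '127.0.0.1:3128']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--check-certificate', 'nocheckcertificate', 'false', 'true')
#   ['--check-certificate', 'false']
#   >>> cli_valueless_option({'downloadarchive': True}, '--no-download-archive', 'downloadarchive', False)
#   []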
3876
3877
3878 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3879 if isinstance(argdict, (list, tuple)): # for backward compatibility
3880 if use_compat:
3881 return argdict
3882 else:
3883 argdict = None
3884 if argdict is None:
3885 return default
3886 assert isinstance(argdict, dict)
3887
3888 assert isinstance(keys, (list, tuple))
3889 for key_list in keys:
3890 arg_list = list(filter(
3891 lambda x: x is not None,
3892 [argdict.get(key.lower()) for key in variadic(key_list)]))
3893 if arg_list:
3894 return [arg for args in arg_list for arg in args]
3895 return default
3896
3897
3898 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3899 main_key, exe = main_key.lower(), exe.lower()
3900 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3901 keys = [f'{root_key}{k}' for k in (keys or [''])]
3902 if root_key in keys:
3903 if main_key != exe:
3904 keys.append((main_key, exe))
3905 keys.append('default')
3906 else:
3907 use_compat = False
3908 return cli_configuration_args(argdict, keys, default, use_compat)
3909
3910
3911 class ISO639Utils:
3912 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3913 _lang_map = {
3914 'aa': 'aar',
3915 'ab': 'abk',
3916 'ae': 'ave',
3917 'af': 'afr',
3918 'ak': 'aka',
3919 'am': 'amh',
3920 'an': 'arg',
3921 'ar': 'ara',
3922 'as': 'asm',
3923 'av': 'ava',
3924 'ay': 'aym',
3925 'az': 'aze',
3926 'ba': 'bak',
3927 'be': 'bel',
3928 'bg': 'bul',
3929 'bh': 'bih',
3930 'bi': 'bis',
3931 'bm': 'bam',
3932 'bn': 'ben',
3933 'bo': 'bod',
3934 'br': 'bre',
3935 'bs': 'bos',
3936 'ca': 'cat',
3937 'ce': 'che',
3938 'ch': 'cha',
3939 'co': 'cos',
3940 'cr': 'cre',
3941 'cs': 'ces',
3942 'cu': 'chu',
3943 'cv': 'chv',
3944 'cy': 'cym',
3945 'da': 'dan',
3946 'de': 'deu',
3947 'dv': 'div',
3948 'dz': 'dzo',
3949 'ee': 'ewe',
3950 'el': 'ell',
3951 'en': 'eng',
3952 'eo': 'epo',
3953 'es': 'spa',
3954 'et': 'est',
3955 'eu': 'eus',
3956 'fa': 'fas',
3957 'ff': 'ful',
3958 'fi': 'fin',
3959 'fj': 'fij',
3960 'fo': 'fao',
3961 'fr': 'fra',
3962 'fy': 'fry',
3963 'ga': 'gle',
3964 'gd': 'gla',
3965 'gl': 'glg',
3966 'gn': 'grn',
3967 'gu': 'guj',
3968 'gv': 'glv',
3969 'ha': 'hau',
3970 'he': 'heb',
3971 'iw': 'heb', # Replaced by he in 1989 revision
3972 'hi': 'hin',
3973 'ho': 'hmo',
3974 'hr': 'hrv',
3975 'ht': 'hat',
3976 'hu': 'hun',
3977 'hy': 'hye',
3978 'hz': 'her',
3979 'ia': 'ina',
3980 'id': 'ind',
3981 'in': 'ind', # Replaced by id in 1989 revision
3982 'ie': 'ile',
3983 'ig': 'ibo',
3984 'ii': 'iii',
3985 'ik': 'ipk',
3986 'io': 'ido',
3987 'is': 'isl',
3988 'it': 'ita',
3989 'iu': 'iku',
3990 'ja': 'jpn',
3991 'jv': 'jav',
3992 'ka': 'kat',
3993 'kg': 'kon',
3994 'ki': 'kik',
3995 'kj': 'kua',
3996 'kk': 'kaz',
3997 'kl': 'kal',
3998 'km': 'khm',
3999 'kn': 'kan',
4000 'ko': 'kor',
4001 'kr': 'kau',
4002 'ks': 'kas',
4003 'ku': 'kur',
4004 'kv': 'kom',
4005 'kw': 'cor',
4006 'ky': 'kir',
4007 'la': 'lat',
4008 'lb': 'ltz',
4009 'lg': 'lug',
4010 'li': 'lim',
4011 'ln': 'lin',
4012 'lo': 'lao',
4013 'lt': 'lit',
4014 'lu': 'lub',
4015 'lv': 'lav',
4016 'mg': 'mlg',
4017 'mh': 'mah',
4018 'mi': 'mri',
4019 'mk': 'mkd',
4020 'ml': 'mal',
4021 'mn': 'mon',
4022 'mr': 'mar',
4023 'ms': 'msa',
4024 'mt': 'mlt',
4025 'my': 'mya',
4026 'na': 'nau',
4027 'nb': 'nob',
4028 'nd': 'nde',
4029 'ne': 'nep',
4030 'ng': 'ndo',
4031 'nl': 'nld',
4032 'nn': 'nno',
4033 'no': 'nor',
4034 'nr': 'nbl',
4035 'nv': 'nav',
4036 'ny': 'nya',
4037 'oc': 'oci',
4038 'oj': 'oji',
4039 'om': 'orm',
4040 'or': 'ori',
4041 'os': 'oss',
4042 'pa': 'pan',
4043 'pi': 'pli',
4044 'pl': 'pol',
4045 'ps': 'pus',
4046 'pt': 'por',
4047 'qu': 'que',
4048 'rm': 'roh',
4049 'rn': 'run',
4050 'ro': 'ron',
4051 'ru': 'rus',
4052 'rw': 'kin',
4053 'sa': 'san',
4054 'sc': 'srd',
4055 'sd': 'snd',
4056 'se': 'sme',
4057 'sg': 'sag',
4058 'si': 'sin',
4059 'sk': 'slk',
4060 'sl': 'slv',
4061 'sm': 'smo',
4062 'sn': 'sna',
4063 'so': 'som',
4064 'sq': 'sqi',
4065 'sr': 'srp',
4066 'ss': 'ssw',
4067 'st': 'sot',
4068 'su': 'sun',
4069 'sv': 'swe',
4070 'sw': 'swa',
4071 'ta': 'tam',
4072 'te': 'tel',
4073 'tg': 'tgk',
4074 'th': 'tha',
4075 'ti': 'tir',
4076 'tk': 'tuk',
4077 'tl': 'tgl',
4078 'tn': 'tsn',
4079 'to': 'ton',
4080 'tr': 'tur',
4081 'ts': 'tso',
4082 'tt': 'tat',
4083 'tw': 'twi',
4084 'ty': 'tah',
4085 'ug': 'uig',
4086 'uk': 'ukr',
4087 'ur': 'urd',
4088 'uz': 'uzb',
4089 've': 'ven',
4090 'vi': 'vie',
4091 'vo': 'vol',
4092 'wa': 'wln',
4093 'wo': 'wol',
4094 'xh': 'xho',
4095 'yi': 'yid',
4096 'ji': 'yid', # Replaced by yi in 1989 revision
4097 'yo': 'yor',
4098 'za': 'zha',
4099 'zh': 'zho',
4100 'zu': 'zul',
4101 }
4102
4103 @classmethod
4104 def short2long(cls, code):
4105 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4106 return cls._lang_map.get(code[:2])
4107
4108 @classmethod
4109 def long2short(cls, code):
4110 """Convert language code from ISO 639-2/T to ISO 639-1"""
4111 for short_name, long_name in cls._lang_map.items():
4112 if long_name == code:
4113 return short_name
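
# Illustrative usage sketch (hand-computed, untested):
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'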
4114
4115
4116 class ISO3166Utils:
4117 # From http://data.okfn.org/data/core/country-list
4118 _country_map = {
4119 'AF': 'Afghanistan',
4120 'AX': 'Åland Islands',
4121 'AL': 'Albania',
4122 'DZ': 'Algeria',
4123 'AS': 'American Samoa',
4124 'AD': 'Andorra',
4125 'AO': 'Angola',
4126 'AI': 'Anguilla',
4127 'AQ': 'Antarctica',
4128 'AG': 'Antigua and Barbuda',
4129 'AR': 'Argentina',
4130 'AM': 'Armenia',
4131 'AW': 'Aruba',
4132 'AU': 'Australia',
4133 'AT': 'Austria',
4134 'AZ': 'Azerbaijan',
4135 'BS': 'Bahamas',
4136 'BH': 'Bahrain',
4137 'BD': 'Bangladesh',
4138 'BB': 'Barbados',
4139 'BY': 'Belarus',
4140 'BE': 'Belgium',
4141 'BZ': 'Belize',
4142 'BJ': 'Benin',
4143 'BM': 'Bermuda',
4144 'BT': 'Bhutan',
4145 'BO': 'Bolivia, Plurinational State of',
4146 'BQ': 'Bonaire, Sint Eustatius and Saba',
4147 'BA': 'Bosnia and Herzegovina',
4148 'BW': 'Botswana',
4149 'BV': 'Bouvet Island',
4150 'BR': 'Brazil',
4151 'IO': 'British Indian Ocean Territory',
4152 'BN': 'Brunei Darussalam',
4153 'BG': 'Bulgaria',
4154 'BF': 'Burkina Faso',
4155 'BI': 'Burundi',
4156 'KH': 'Cambodia',
4157 'CM': 'Cameroon',
4158 'CA': 'Canada',
4159 'CV': 'Cape Verde',
4160 'KY': 'Cayman Islands',
4161 'CF': 'Central African Republic',
4162 'TD': 'Chad',
4163 'CL': 'Chile',
4164 'CN': 'China',
4165 'CX': 'Christmas Island',
4166 'CC': 'Cocos (Keeling) Islands',
4167 'CO': 'Colombia',
4168 'KM': 'Comoros',
4169 'CG': 'Congo',
4170 'CD': 'Congo, the Democratic Republic of the',
4171 'CK': 'Cook Islands',
4172 'CR': 'Costa Rica',
4173 'CI': 'Côte d\'Ivoire',
4174 'HR': 'Croatia',
4175 'CU': 'Cuba',
4176 'CW': 'Curaçao',
4177 'CY': 'Cyprus',
4178 'CZ': 'Czech Republic',
4179 'DK': 'Denmark',
4180 'DJ': 'Djibouti',
4181 'DM': 'Dominica',
4182 'DO': 'Dominican Republic',
4183 'EC': 'Ecuador',
4184 'EG': 'Egypt',
4185 'SV': 'El Salvador',
4186 'GQ': 'Equatorial Guinea',
4187 'ER': 'Eritrea',
4188 'EE': 'Estonia',
4189 'ET': 'Ethiopia',
4190 'FK': 'Falkland Islands (Malvinas)',
4191 'FO': 'Faroe Islands',
4192 'FJ': 'Fiji',
4193 'FI': 'Finland',
4194 'FR': 'France',
4195 'GF': 'French Guiana',
4196 'PF': 'French Polynesia',
4197 'TF': 'French Southern Territories',
4198 'GA': 'Gabon',
4199 'GM': 'Gambia',
4200 'GE': 'Georgia',
4201 'DE': 'Germany',
4202 'GH': 'Ghana',
4203 'GI': 'Gibraltar',
4204 'GR': 'Greece',
4205 'GL': 'Greenland',
4206 'GD': 'Grenada',
4207 'GP': 'Guadeloupe',
4208 'GU': 'Guam',
4209 'GT': 'Guatemala',
4210 'GG': 'Guernsey',
4211 'GN': 'Guinea',
4212 'GW': 'Guinea-Bissau',
4213 'GY': 'Guyana',
4214 'HT': 'Haiti',
4215 'HM': 'Heard Island and McDonald Islands',
4216 'VA': 'Holy See (Vatican City State)',
4217 'HN': 'Honduras',
4218 'HK': 'Hong Kong',
4219 'HU': 'Hungary',
4220 'IS': 'Iceland',
4221 'IN': 'India',
4222 'ID': 'Indonesia',
4223 'IR': 'Iran, Islamic Republic of',
4224 'IQ': 'Iraq',
4225 'IE': 'Ireland',
4226 'IM': 'Isle of Man',
4227 'IL': 'Israel',
4228 'IT': 'Italy',
4229 'JM': 'Jamaica',
4230 'JP': 'Japan',
4231 'JE': 'Jersey',
4232 'JO': 'Jordan',
4233 'KZ': 'Kazakhstan',
4234 'KE': 'Kenya',
4235 'KI': 'Kiribati',
4236 'KP': 'Korea, Democratic People\'s Republic of',
4237 'KR': 'Korea, Republic of',
4238 'KW': 'Kuwait',
4239 'KG': 'Kyrgyzstan',
4240 'LA': 'Lao People\'s Democratic Republic',
4241 'LV': 'Latvia',
4242 'LB': 'Lebanon',
4243 'LS': 'Lesotho',
4244 'LR': 'Liberia',
4245 'LY': 'Libya',
4246 'LI': 'Liechtenstein',
4247 'LT': 'Lithuania',
4248 'LU': 'Luxembourg',
4249 'MO': 'Macao',
4250 'MK': 'Macedonia, the Former Yugoslav Republic of',
4251 'MG': 'Madagascar',
4252 'MW': 'Malawi',
4253 'MY': 'Malaysia',
4254 'MV': 'Maldives',
4255 'ML': 'Mali',
4256 'MT': 'Malta',
4257 'MH': 'Marshall Islands',
4258 'MQ': 'Martinique',
4259 'MR': 'Mauritania',
4260 'MU': 'Mauritius',
4261 'YT': 'Mayotte',
4262 'MX': 'Mexico',
4263 'FM': 'Micronesia, Federated States of',
4264 'MD': 'Moldova, Republic of',
4265 'MC': 'Monaco',
4266 'MN': 'Mongolia',
4267 'ME': 'Montenegro',
4268 'MS': 'Montserrat',
4269 'MA': 'Morocco',
4270 'MZ': 'Mozambique',
4271 'MM': 'Myanmar',
4272 'NA': 'Namibia',
4273 'NR': 'Nauru',
4274 'NP': 'Nepal',
4275 'NL': 'Netherlands',
4276 'NC': 'New Caledonia',
4277 'NZ': 'New Zealand',
4278 'NI': 'Nicaragua',
4279 'NE': 'Niger',
4280 'NG': 'Nigeria',
4281 'NU': 'Niue',
4282 'NF': 'Norfolk Island',
4283 'MP': 'Northern Mariana Islands',
4284 'NO': 'Norway',
4285 'OM': 'Oman',
4286 'PK': 'Pakistan',
4287 'PW': 'Palau',
4288 'PS': 'Palestine, State of',
4289 'PA': 'Panama',
4290 'PG': 'Papua New Guinea',
4291 'PY': 'Paraguay',
4292 'PE': 'Peru',
4293 'PH': 'Philippines',
4294 'PN': 'Pitcairn',
4295 'PL': 'Poland',
4296 'PT': 'Portugal',
4297 'PR': 'Puerto Rico',
4298 'QA': 'Qatar',
4299 'RE': 'Réunion',
4300 'RO': 'Romania',
4301 'RU': 'Russian Federation',
4302 'RW': 'Rwanda',
4303 'BL': 'Saint Barthélemy',
4304 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4305 'KN': 'Saint Kitts and Nevis',
4306 'LC': 'Saint Lucia',
4307 'MF': 'Saint Martin (French part)',
4308 'PM': 'Saint Pierre and Miquelon',
4309 'VC': 'Saint Vincent and the Grenadines',
4310 'WS': 'Samoa',
4311 'SM': 'San Marino',
4312 'ST': 'Sao Tome and Principe',
4313 'SA': 'Saudi Arabia',
4314 'SN': 'Senegal',
4315 'RS': 'Serbia',
4316 'SC': 'Seychelles',
4317 'SL': 'Sierra Leone',
4318 'SG': 'Singapore',
4319 'SX': 'Sint Maarten (Dutch part)',
4320 'SK': 'Slovakia',
4321 'SI': 'Slovenia',
4322 'SB': 'Solomon Islands',
4323 'SO': 'Somalia',
4324 'ZA': 'South Africa',
4325 'GS': 'South Georgia and the South Sandwich Islands',
4326 'SS': 'South Sudan',
4327 'ES': 'Spain',
4328 'LK': 'Sri Lanka',
4329 'SD': 'Sudan',
4330 'SR': 'Suriname',
4331 'SJ': 'Svalbard and Jan Mayen',
4332 'SZ': 'Swaziland',
4333 'SE': 'Sweden',
4334 'CH': 'Switzerland',
4335 'SY': 'Syrian Arab Republic',
4336 'TW': 'Taiwan, Province of China',
4337 'TJ': 'Tajikistan',
4338 'TZ': 'Tanzania, United Republic of',
4339 'TH': 'Thailand',
4340 'TL': 'Timor-Leste',
4341 'TG': 'Togo',
4342 'TK': 'Tokelau',
4343 'TO': 'Tonga',
4344 'TT': 'Trinidad and Tobago',
4345 'TN': 'Tunisia',
4346 'TR': 'Turkey',
4347 'TM': 'Turkmenistan',
4348 'TC': 'Turks and Caicos Islands',
4349 'TV': 'Tuvalu',
4350 'UG': 'Uganda',
4351 'UA': 'Ukraine',
4352 'AE': 'United Arab Emirates',
4353 'GB': 'United Kingdom',
4354 'US': 'United States',
4355 'UM': 'United States Minor Outlying Islands',
4356 'UY': 'Uruguay',
4357 'UZ': 'Uzbekistan',
4358 'VU': 'Vanuatu',
4359 'VE': 'Venezuela, Bolivarian Republic of',
4360 'VN': 'Viet Nam',
4361 'VG': 'Virgin Islands, British',
4362 'VI': 'Virgin Islands, U.S.',
4363 'WF': 'Wallis and Futuna',
4364 'EH': 'Western Sahara',
4365 'YE': 'Yemen',
4366 'ZM': 'Zambia',
4367 'ZW': 'Zimbabwe',
4368 # Not ISO 3166 codes, but used for IP blocks
4369 'AP': 'Asia/Pacific Region',
4370 'EU': 'Europe',
4371 }
4372
4373 @classmethod
4374 def short2full(cls, code):
4375 """Convert an ISO 3166-2 country code to the corresponding full name"""
4376 return cls._country_map.get(code.upper())
4377
4378
4379 class GeoUtils:
4380 # Major IPv4 address blocks per country
4381 _country_ip_map = {
4382 'AD': '46.172.224.0/19',
4383 'AE': '94.200.0.0/13',
4384 'AF': '149.54.0.0/17',
4385 'AG': '209.59.64.0/18',
4386 'AI': '204.14.248.0/21',
4387 'AL': '46.99.0.0/16',
4388 'AM': '46.70.0.0/15',
4389 'AO': '105.168.0.0/13',
4390 'AP': '182.50.184.0/21',
4391 'AQ': '23.154.160.0/24',
4392 'AR': '181.0.0.0/12',
4393 'AS': '202.70.112.0/20',
4394 'AT': '77.116.0.0/14',
4395 'AU': '1.128.0.0/11',
4396 'AW': '181.41.0.0/18',
4397 'AX': '185.217.4.0/22',
4398 'AZ': '5.197.0.0/16',
4399 'BA': '31.176.128.0/17',
4400 'BB': '65.48.128.0/17',
4401 'BD': '114.130.0.0/16',
4402 'BE': '57.0.0.0/8',
4403 'BF': '102.178.0.0/15',
4404 'BG': '95.42.0.0/15',
4405 'BH': '37.131.0.0/17',
4406 'BI': '154.117.192.0/18',
4407 'BJ': '137.255.0.0/16',
4408 'BL': '185.212.72.0/23',
4409 'BM': '196.12.64.0/18',
4410 'BN': '156.31.0.0/16',
4411 'BO': '161.56.0.0/16',
4412 'BQ': '161.0.80.0/20',
4413 'BR': '191.128.0.0/12',
4414 'BS': '24.51.64.0/18',
4415 'BT': '119.2.96.0/19',
4416 'BW': '168.167.0.0/16',
4417 'BY': '178.120.0.0/13',
4418 'BZ': '179.42.192.0/18',
4419 'CA': '99.224.0.0/11',
4420 'CD': '41.243.0.0/16',
4421 'CF': '197.242.176.0/21',
4422 'CG': '160.113.0.0/16',
4423 'CH': '85.0.0.0/13',
4424 'CI': '102.136.0.0/14',
4425 'CK': '202.65.32.0/19',
4426 'CL': '152.172.0.0/14',
4427 'CM': '102.244.0.0/14',
4428 'CN': '36.128.0.0/10',
4429 'CO': '181.240.0.0/12',
4430 'CR': '201.192.0.0/12',
4431 'CU': '152.206.0.0/15',
4432 'CV': '165.90.96.0/19',
4433 'CW': '190.88.128.0/17',
4434 'CY': '31.153.0.0/16',
4435 'CZ': '88.100.0.0/14',
4436 'DE': '53.0.0.0/8',
4437 'DJ': '197.241.0.0/17',
4438 'DK': '87.48.0.0/12',
4439 'DM': '192.243.48.0/20',
4440 'DO': '152.166.0.0/15',
4441 'DZ': '41.96.0.0/12',
4442 'EC': '186.68.0.0/15',
4443 'EE': '90.190.0.0/15',
4444 'EG': '156.160.0.0/11',
4445 'ER': '196.200.96.0/20',
4446 'ES': '88.0.0.0/11',
4447 'ET': '196.188.0.0/14',
4448 'EU': '2.16.0.0/13',
4449 'FI': '91.152.0.0/13',
4450 'FJ': '144.120.0.0/16',
4451 'FK': '80.73.208.0/21',
4452 'FM': '119.252.112.0/20',
4453 'FO': '88.85.32.0/19',
4454 'FR': '90.0.0.0/9',
4455 'GA': '41.158.0.0/15',
4456 'GB': '25.0.0.0/8',
4457 'GD': '74.122.88.0/21',
4458 'GE': '31.146.0.0/16',
4459 'GF': '161.22.64.0/18',
4460 'GG': '62.68.160.0/19',
4461 'GH': '154.160.0.0/12',
4462 'GI': '95.164.0.0/16',
4463 'GL': '88.83.0.0/19',
4464 'GM': '160.182.0.0/15',
4465 'GN': '197.149.192.0/18',
4466 'GP': '104.250.0.0/19',
4467 'GQ': '105.235.224.0/20',
4468 'GR': '94.64.0.0/13',
4469 'GT': '168.234.0.0/16',
4470 'GU': '168.123.0.0/16',
4471 'GW': '197.214.80.0/20',
4472 'GY': '181.41.64.0/18',
4473 'HK': '113.252.0.0/14',
4474 'HN': '181.210.0.0/16',
4475 'HR': '93.136.0.0/13',
4476 'HT': '148.102.128.0/17',
4477 'HU': '84.0.0.0/14',
4478 'ID': '39.192.0.0/10',
4479 'IE': '87.32.0.0/12',
4480 'IL': '79.176.0.0/13',
4481 'IM': '5.62.80.0/20',
4482 'IN': '117.192.0.0/10',
4483 'IO': '203.83.48.0/21',
4484 'IQ': '37.236.0.0/14',
4485 'IR': '2.176.0.0/12',
4486 'IS': '82.221.0.0/16',
4487 'IT': '79.0.0.0/10',
4488 'JE': '87.244.64.0/18',
4489 'JM': '72.27.0.0/17',
4490 'JO': '176.29.0.0/16',
4491 'JP': '133.0.0.0/8',
4492 'KE': '105.48.0.0/12',
4493 'KG': '158.181.128.0/17',
4494 'KH': '36.37.128.0/17',
4495 'KI': '103.25.140.0/22',
4496 'KM': '197.255.224.0/20',
4497 'KN': '198.167.192.0/19',
4498 'KP': '175.45.176.0/22',
4499 'KR': '175.192.0.0/10',
4500 'KW': '37.36.0.0/14',
4501 'KY': '64.96.0.0/15',
4502 'KZ': '2.72.0.0/13',
4503 'LA': '115.84.64.0/18',
4504 'LB': '178.135.0.0/16',
4505 'LC': '24.92.144.0/20',
4506 'LI': '82.117.0.0/19',
4507 'LK': '112.134.0.0/15',
4508 'LR': '102.183.0.0/16',
4509 'LS': '129.232.0.0/17',
4510 'LT': '78.56.0.0/13',
4511 'LU': '188.42.0.0/16',
4512 'LV': '46.109.0.0/16',
4513 'LY': '41.252.0.0/14',
4514 'MA': '105.128.0.0/11',
4515 'MC': '88.209.64.0/18',
4516 'MD': '37.246.0.0/16',
4517 'ME': '178.175.0.0/17',
4518 'MF': '74.112.232.0/21',
4519 'MG': '154.126.0.0/17',
4520 'MH': '117.103.88.0/21',
4521 'MK': '77.28.0.0/15',
4522 'ML': '154.118.128.0/18',
4523 'MM': '37.111.0.0/17',
4524 'MN': '49.0.128.0/17',
4525 'MO': '60.246.0.0/16',
4526 'MP': '202.88.64.0/20',
4527 'MQ': '109.203.224.0/19',
4528 'MR': '41.188.64.0/18',
4529 'MS': '208.90.112.0/22',
4530 'MT': '46.11.0.0/16',
4531 'MU': '105.16.0.0/12',
4532 'MV': '27.114.128.0/18',
4533 'MW': '102.70.0.0/15',
4534 'MX': '187.192.0.0/11',
4535 'MY': '175.136.0.0/13',
4536 'MZ': '197.218.0.0/15',
4537 'NA': '41.182.0.0/16',
4538 'NC': '101.101.0.0/18',
4539 'NE': '197.214.0.0/18',
4540 'NF': '203.17.240.0/22',
4541 'NG': '105.112.0.0/12',
4542 'NI': '186.76.0.0/15',
4543 'NL': '145.96.0.0/11',
4544 'NO': '84.208.0.0/13',
4545 'NP': '36.252.0.0/15',
4546 'NR': '203.98.224.0/19',
4547 'NU': '49.156.48.0/22',
4548 'NZ': '49.224.0.0/14',
4549 'OM': '5.36.0.0/15',
4550 'PA': '186.72.0.0/15',
4551 'PE': '186.160.0.0/14',
4552 'PF': '123.50.64.0/18',
4553 'PG': '124.240.192.0/19',
4554 'PH': '49.144.0.0/13',
4555 'PK': '39.32.0.0/11',
4556 'PL': '83.0.0.0/11',
4557 'PM': '70.36.0.0/20',
4558 'PR': '66.50.0.0/16',
4559 'PS': '188.161.0.0/16',
4560 'PT': '85.240.0.0/13',
4561 'PW': '202.124.224.0/20',
4562 'PY': '181.120.0.0/14',
4563 'QA': '37.210.0.0/15',
4564 'RE': '102.35.0.0/16',
4565 'RO': '79.112.0.0/13',
4566 'RS': '93.86.0.0/15',
4567 'RU': '5.136.0.0/13',
4568 'RW': '41.186.0.0/16',
4569 'SA': '188.48.0.0/13',
4570 'SB': '202.1.160.0/19',
4571 'SC': '154.192.0.0/11',
4572 'SD': '102.120.0.0/13',
4573 'SE': '78.64.0.0/12',
4574 'SG': '8.128.0.0/10',
4575 'SI': '188.196.0.0/14',
4576 'SK': '78.98.0.0/15',
4577 'SL': '102.143.0.0/17',
4578 'SM': '89.186.32.0/19',
4579 'SN': '41.82.0.0/15',
4580 'SO': '154.115.192.0/18',
4581 'SR': '186.179.128.0/17',
4582 'SS': '105.235.208.0/21',
4583 'ST': '197.159.160.0/19',
4584 'SV': '168.243.0.0/16',
4585 'SX': '190.102.0.0/20',
4586 'SY': '5.0.0.0/16',
4587 'SZ': '41.84.224.0/19',
4588 'TC': '65.255.48.0/20',
4589 'TD': '154.68.128.0/19',
4590 'TG': '196.168.0.0/14',
4591 'TH': '171.96.0.0/13',
4592 'TJ': '85.9.128.0/18',
4593 'TK': '27.96.24.0/21',
4594 'TL': '180.189.160.0/20',
4595 'TM': '95.85.96.0/19',
4596 'TN': '197.0.0.0/11',
4597 'TO': '175.176.144.0/21',
4598 'TR': '78.160.0.0/11',
4599 'TT': '186.44.0.0/15',
4600 'TV': '202.2.96.0/19',
4601 'TW': '120.96.0.0/11',
4602 'TZ': '156.156.0.0/14',
4603 'UA': '37.52.0.0/14',
4604 'UG': '102.80.0.0/13',
4605 'US': '6.0.0.0/8',
4606 'UY': '167.56.0.0/13',
4607 'UZ': '84.54.64.0/18',
4608 'VA': '212.77.0.0/19',
4609 'VC': '207.191.240.0/21',
4610 'VE': '186.88.0.0/13',
4611 'VG': '66.81.192.0/20',
4612 'VI': '146.226.0.0/16',
4613 'VN': '14.160.0.0/11',
4614 'VU': '202.80.32.0/20',
4615 'WF': '117.20.32.0/21',
4616 'WS': '202.4.32.0/19',
4617 'YE': '134.35.0.0/16',
4618 'YT': '41.242.116.0/22',
4619 'ZA': '41.0.0.0/11',
4620 'ZM': '102.144.0.0/13',
4621 'ZW': '102.177.192.0/18',
4622 }
4623
4624 @classmethod
4625 def random_ipv4(cls, code_or_block):
4626 if len(code_or_block) == 2:
4627 block = cls._country_ip_map.get(code_or_block.upper())
4628 if not block:
4629 return None
4630 else:
4631 block = code_or_block
4632 addr, preflen = block.split('/')
4633 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4634 addr_max = addr_min | (0xffffffff >> int(preflen))
4635 return compat_str(socket.inet_ntoa(
4636 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
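
# Illustrative usage sketch (hand-computed, untested): a two-letter argument is
# looked up in _country_ip_map; anything else is treated as a CIDR block.
#   >>> GeoUtils.random_ipv4('FR')          # a random address within 90.0.0.0/9
#   >>> GeoUtils.random_ipv4('10.0.0.0/8')  # a random address within 10.0.0.0/8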
4637
4638
4639 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4640 def __init__(self, proxies=None):
4641 # Set default handlers
4642 for type in ('http', 'https'):
4643 setattr(self, '%s_open' % type,
4644 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4645 meth(r, proxy, type))
4646 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4647
4648 def proxy_open(self, req, proxy, type):
4649 req_proxy = req.headers.get('Ytdl-request-proxy')
4650 if req_proxy is not None:
4651 proxy = req_proxy
4652 del req.headers['Ytdl-request-proxy']
4653
4654 if proxy == '__noproxy__':
4655 return None # No Proxy
4656 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4657 req.add_header('Ytdl-socks-proxy', proxy)
4658 # yt-dlp's http/https handlers wrap the socket with socks
4659 return None
4660 return compat_urllib_request.ProxyHandler.proxy_open(
4661 self, req, proxy, type)
4662
4663
4664 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4665 # released into Public Domain
4666 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4667
4668 def long_to_bytes(n, blocksize=0):
4669 """long_to_bytes(n:long, blocksize:int) : string
4670 Convert a long integer to a byte string.
4671
4672 If optional blocksize is given and greater than zero, pad the front of the
4673 byte string with binary zeros so that the length is a multiple of
4674 blocksize.
4675 """
4676 # after much testing, this algorithm was deemed to be the fastest
4677 s = b''
4678 n = int(n)
4679 while n > 0:
4680 s = compat_struct_pack('>I', n & 0xffffffff) + s
4681 n = n >> 32
4682 # strip off leading zeros
4683 for i in range(len(s)):
4684 if s[i] != b'\000'[0]:
4685 break
4686 else:
4687 # only happens when n == 0
4688 s = b'\000'
4689 i = 0
4690 s = s[i:]
4691 # add back some pad bytes. this could be done more efficiently w.r.t. the
4692 # de-padding being done above, but sigh...
4693 if blocksize > 0 and len(s) % blocksize:
4694 s = (blocksize - len(s) % blocksize) * b'\000' + s
4695 return s
4696
4697
4698 def bytes_to_long(s):
4699 """bytes_to_long(string) : long
4700 Convert a byte string to a long integer.
4701
4702 This is (essentially) the inverse of long_to_bytes().
4703 """
4704 acc = 0
4705 length = len(s)
4706 if length % 4:
4707 extra = (4 - length % 4)
4708 s = b'\000' * extra + s
4709 length = length + extra
4710 for i in range(0, length, 4):
4711 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4712 return acc
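
# Illustrative usage sketch (hand-computed, untested): the two helpers are
# inverses for big-endian byte strings.
#   >>> long_to_bytes(256)
#   b'\x01\x00'
#   >>> bytes_to_long(b'\x01\x00')
#   256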
4713
4714
4715 def ohdave_rsa_encrypt(data, exponent, modulus):
4716 '''
4717 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4718
4719 Input:
4720 data: data to encrypt, bytes-like object
4721 exponent, modulus: parameter e and N of RSA algorithm, both integer
4722 Output: hex string of encrypted data
4723
4724 Limitation: supports one block encryption only
4725 '''
4726
4727 payload = int(binascii.hexlify(data[::-1]), 16)
4728 encrypted = pow(payload, exponent, modulus)
4729 return '%x' % encrypted
4730
4731
4732 def pkcs1pad(data, length):
4733 """
4734 Padding input data with PKCS#1 scheme
4735
4736 @param {int[]} data input data
4737 @param {int} length target length
4738 @returns {int[]} padded data
4739 """
4740 if len(data) > length - 11:
4741 raise ValueError('Input data too long for PKCS#1 padding')
4742
4743 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding octets must be non-zero (RFC 8017 section 7.2.1)
4744 return [0, 2] + pseudo_random + [0] + data
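
# Illustrative usage sketch (hand-computed, untested): the EME-PKCS1-v1_5
# layout is [0x00, 0x02, <non-zero padding>, 0x00, <data>], so the result
# always starts with [0, 2] and is exactly `length` items long.
#   >>> padded = pkcs1pad(list(b'msg'), 16)
#   >>> padded[:2], padded[-4:], len(padded)
#   ([0, 2], [0, 109, 115, 103], 16)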
4745
4746
4747 def encode_base_n(num, n, table=None):
4748 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4749 if not table:
4750 table = FULL_TABLE[:n]
4751
4752 if n > len(table):
4753 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4754
4755 if num == 0:
4756 return table[0]
4757
4758 ret = ''
4759 while num:
4760 ret = table[num % n] + ret
4761 num = num // n
4762 return ret
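
# Illustrative usage sketch (hand-computed, untested):
#   >>> encode_base_n(123, 16)
#   '7b'
#   >>> encode_base_n(5, 2, table='01')
#   '101'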
4763
4764
4765 def decode_packed_codes(code):
4766 mobj = re.search(PACKED_CODES_RE, code)
4767 obfuscated_code, base, count, symbols = mobj.groups()
4768 base = int(base)
4769 count = int(count)
4770 symbols = symbols.split('|')
4771 symbol_table = {}
4772
4773 while count:
4774 count -= 1
4775 base_n_count = encode_base_n(count, base)
4776 symbol_table[base_n_count] = symbols[count] or base_n_count
4777
4778 return re.sub(
4779 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4780 obfuscated_code)
4781
4782
4783 def caesar(s, alphabet, shift):
4784 if shift == 0:
4785 return s
4786 l = len(alphabet)
4787 return ''.join(
4788 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4789 for c in s)
4790
4791
4792 def rot47(s):
4793 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
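
# Illustrative usage sketch (hand-computed, untested): caesar() rotates only
# characters present in the alphabet, and rot47 is its own inverse because its
# alphabet has 94 characters.
#   >>> caesar('ab-c', 'abc', 1)
#   'bc-a'
#   >>> rot47(rot47('yt-dlp'))
#   'yt-dlp'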
4794
4795
4796 def parse_m3u8_attributes(attrib):
4797 info = {}
4798 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4799 if val.startswith('"'):
4800 val = val[1:-1]
4801 info[key] = val
4802 return info
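
# Illustrative usage sketch (hand-computed, untested): quoted values keep
# embedded commas, unquoted values end at the next comma.
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401e,mp4a.40.2"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401e,mp4a.40.2'}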
4803
4804
4805 def urshift(val, n):
4806 return val >> n if val >= 0 else (val + 0x100000000) >> n
4807
4808
4809 # Based on png2str() written by @gdkchan and improved by @yokrysty
4810 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4811 def decode_png(png_data):
4812 # Reference: https://www.w3.org/TR/PNG/
4813 header = png_data[8:]
4814
4815 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4816 raise OSError('Not a valid PNG file.')
4817
4818 int_map = {1: '>B', 2: '>H', 4: '>I'}
4819 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4820
4821 chunks = []
4822
4823 while header:
4824 length = unpack_integer(header[:4])
4825 header = header[4:]
4826
4827 chunk_type = header[:4]
4828 header = header[4:]
4829
4830 chunk_data = header[:length]
4831 header = header[length:]
4832
4833 header = header[4:] # Skip CRC
4834
4835 chunks.append({
4836 'type': chunk_type,
4837 'length': length,
4838 'data': chunk_data
4839 })
4840
4841 ihdr = chunks[0]['data']
4842
4843 width = unpack_integer(ihdr[:4])
4844 height = unpack_integer(ihdr[4:8])
4845
4846 idat = b''
4847
4848 for chunk in chunks:
4849 if chunk['type'] == b'IDAT':
4850 idat += chunk['data']
4851
4852 if not idat:
4853 raise OSError('Unable to read PNG data.')
4854
4855 decompressed_data = bytearray(zlib.decompress(idat))
4856
4857 stride = width * 3
4858 pixels = []
4859
4860 def _get_pixel(idx):
4861 x = idx % stride
4862 y = idx // stride
4863 return pixels[y][x]
4864
4865 for y in range(height):
4866 basePos = y * (1 + stride)
4867 filter_type = decompressed_data[basePos]
4868
4869 current_row = []
4870
4871 pixels.append(current_row)
4872
4873 for x in range(stride):
4874 color = decompressed_data[1 + basePos + x]
4875 basex = y * stride + x
4876 left = 0
4877 up = 0
4878
4879 if x > 2:
4880 left = _get_pixel(basex - 3)
4881 if y > 0:
4882 up = _get_pixel(basex - stride)
4883
4884 if filter_type == 1: # Sub
4885 color = (color + left) & 0xff
4886 elif filter_type == 2: # Up
4887 color = (color + up) & 0xff
4888 elif filter_type == 3: # Average
4889 color = (color + ((left + up) >> 1)) & 0xff
4890 elif filter_type == 4: # Paeth
4891 a = left
4892 b = up
4893 c = 0
4894
4895 if x > 2 and y > 0:
4896 c = _get_pixel(basex - stride - 3)
4897
4898 p = a + b - c
4899
4900 pa = abs(p - a)
4901 pb = abs(p - b)
4902 pc = abs(p - c)
4903
4904 if pa <= pb and pa <= pc:
4905 color = (color + a) & 0xff
4906 elif pb <= pc:
4907 color = (color + b) & 0xff
4908 else:
4909 color = (color + c) & 0xff
4910
4911 current_row.append(color)
4912
4913 return width, height, pixels
4914
4915
4916 def write_xattr(path, key, value):
4917 # Windows: Write xattrs to NTFS Alternate Data Streams:
4918 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4919 if compat_os_name == 'nt':
4920 assert ':' not in key
4921 assert os.path.exists(path)
4922
4923 try:
4924 with open(f'{path}:{key}', 'wb') as f:
4925 f.write(value)
4926 except OSError as e:
4927 raise XAttrMetadataError(e.errno, e.strerror)
4928 return
4929
4930 # UNIX Method 1. Use the xattr/pyxattr modules
4931 from .dependencies import xattr
4932
4933 setxattr = None
4934 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4935 # Unicode arguments are not supported in pyxattr until version 0.5.0
4936 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4937 if version_tuple(xattr.__version__) >= (0, 5, 0):
4938 setxattr = xattr.set
4939 elif xattr:
4940 setxattr = xattr.setxattr
4941
4942 if setxattr:
4943 try:
4944 setxattr(path, key, value)
4945 except OSError as e:
4946 raise XAttrMetadataError(e.errno, e.strerror)
4947 return
4948
4949 # UNIX Method 2. Use setfattr/xattr executables
4950 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4951 else 'xattr' if check_executable('xattr', ['-h']) else None)
4952 if not exe:
4953 raise XAttrUnavailableError(
4954 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4955 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4956
4957 value = value.decode()
4958 try:
4959 _, stderr, returncode = Popen.run(
4960 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4961 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4962 except OSError as e:
4963 raise XAttrMetadataError(e.errno, e.strerror)
4964 if returncode:
4965 raise XAttrMetadataError(returncode, stderr)
4966
4967
4968 def random_birthday(year_field, month_field, day_field):
4969 start_date = datetime.date(1950, 1, 1)
4970 end_date = datetime.date(1995, 12, 31)
4971 offset = random.randint(0, (end_date - start_date).days)
4972 random_date = start_date + datetime.timedelta(offset)
4973 return {
4974 year_field: str(random_date.year),
4975 month_field: str(random_date.month),
4976 day_field: str(random_date.day),
4977 }
4978
4979
4980 # Templates for internet shortcut files, which are plain text files.
4981 DOT_URL_LINK_TEMPLATE = '''\
4982 [InternetShortcut]
4983 URL=%(url)s
4984 '''
4985
4986 DOT_WEBLOC_LINK_TEMPLATE = '''\
4987 <?xml version="1.0" encoding="UTF-8"?>
4988 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4989 <plist version="1.0">
4990 <dict>
4991 \t<key>URL</key>
4992 \t<string>%(url)s</string>
4993 </dict>
4994 </plist>
4995 '''
4996
4997 DOT_DESKTOP_LINK_TEMPLATE = '''\
4998 [Desktop Entry]
4999 Encoding=UTF-8
5000 Name=%(filename)s
5001 Type=Link
5002 URL=%(url)s
5003 Icon=text-html
5004 '''
5005
5006 LINK_TEMPLATES = {
5007 'url': DOT_URL_LINK_TEMPLATE,
5008 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5009 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5010 }
5011
5012
5013 def iri_to_uri(iri):
5014 """
5015 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5016
5017 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5018 """
5019
5020 iri_parts = compat_urllib_parse_urlparse(iri)
5021
5022 if '[' in iri_parts.netloc:
5023 raise ValueError('IPv6 URIs are not yet supported.')
5024 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5025
5026 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5027
5028 net_location = ''
5029 if iri_parts.username:
5030 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5031 if iri_parts.password is not None:
5032 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5033 net_location += '@'
5034
5035 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5036 # The 'idna' encoding produces ASCII text.
5037 if iri_parts.port is not None and iri_parts.port != 80:
5038 net_location += ':' + str(iri_parts.port)
5039
5040 return urllib.parse.urlunparse(
5041 (iri_parts.scheme,
5042 net_location,
5043
5044 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5045
5046 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5047 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5048
5049 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5050 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5051
5052 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5053
5054 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
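
# Illustrative usage sketch (hand-computed, untested): Unicode hostnames are
# punycoded and non-ASCII path characters percent-encoded as UTF-8, while
# pre-existing %XX escapes are left untouched.
#   >>> iri_to_uri('https://münchen.de/straße')
#   'https://xn--mnchen-3ya.de/stra%C3%9Fe'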
5055
5056
5057 def to_high_limit_path(path):
5058 if sys.platform in ['win32', 'cygwin']:
5059 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5060 return '\\\\?\\' + os.path.abspath(path)
5061
5062 return path
5063
5064
5065 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=None):
5066 val = traverse_obj(obj, *variadic(field))
5067 if (not val and val != 0) if ignore is NO_DEFAULT else val in ignore:
5068 return default
5069 return template % (func(val) if func else val)
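
# Illustrative usage sketch (hand-computed, untested): falsy (but non-zero)
# values fall back to `default` unless an explicit `ignore` container is given.
#   >>> format_field({'width': 1280}, 'width', '%dpx')
#   '1280px'
#   >>> format_field({}, 'width', '%dpx', default='unknown')
#   'unknown'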
5070
5071
5072 def clean_podcast_url(url):
5073 return re.sub(r'''(?x)
5074 (?:
5075 (?:
5076 chtbl\.com/track|
5077 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5078 play\.podtrac\.com
5079 )/[^/]+|
5080 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5081 flex\.acast\.com|
5082 pd(?:
5083 cn\.co| # https://podcorn.com/analytics-prefix/
5084 st\.fm # https://podsights.com/docs/
5085 )/e
5086 )/''', '', url)
5087
5088
5089 _HEX_TABLE = '0123456789abcdef'
5090
5091
5092 def random_uuidv4():
5093 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5094
5095
5096 def make_dir(path, to_screen=None):
5097 try:
5098 dn = os.path.dirname(path)
5099 if dn and not os.path.exists(dn):
5100 os.makedirs(dn)
5101 return True
5102 except OSError as err:
5103 if callable(to_screen):
5104 to_screen('unable to create directory ' + error_to_compat_str(err))
5105 return False
5106
5107
5108 def get_executable_path():
5109 from .update import _get_variant_and_executable_path
5110
5111 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5112
5113
5114 def load_plugins(name, suffix, namespace):
5115 classes = {}
5116 with contextlib.suppress(FileNotFoundError):
5117 plugins_spec = importlib.util.spec_from_file_location(
5118 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5119 plugins = importlib.util.module_from_spec(plugins_spec)
5120 sys.modules[plugins_spec.name] = plugins
5121 plugins_spec.loader.exec_module(plugins)
5122 for name in dir(plugins):
5123 if name in namespace:
5124 continue
5125 if not name.endswith(suffix):
5126 continue
5127 klass = getattr(plugins, name)
5128 classes[name] = namespace[name] = klass
5129 return classes
5130
5131
5132 def traverse_obj(
5133 obj, *path_list, default=None, expected_type=None, get_all=True,
5134 casesense=True, is_user_input=False, traverse_string=False):
5135 ''' Traverse nested list/dict/tuple
5136 @param path_list A list of paths which are checked one by one.
5137 Each path is a list of keys where each key is a:
5138 - None: Do nothing
5139 - string: A dictionary key
5140 - int: An index into a list
5141 - tuple: A list of keys all of which will be traversed
5142 - Ellipsis: Fetch all values in the object
5143 - Function: Takes the key and value as arguments
5144 and returns whether the key matches or not
5145 @param default Default value to return
5146 @param expected_type Only accept final value of this type (Can also be any callable)
5147 @param get_all Return all the values obtained from a path or only the first one
5148 @param casesense Whether to consider dictionary keys as case sensitive
5149 @param is_user_input Whether the keys are generated from user input. If True,
5150 strings are converted to int/slice if necessary
5151 @param traverse_string Whether to traverse inside strings. If True, any
5152 non-compatible object will also be converted into a string
5153 # TODO: Write tests
5154 '''
5155 if not casesense:
5156 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5157 path_list = (map(_lower, variadic(path)) for path in path_list)
5158
5159 def _traverse_obj(obj, path, _current_depth=0):
5160 nonlocal depth
5161 path = tuple(variadic(path))
5162 for i, key in enumerate(path):
5163 if None in (key, obj):
5164 return obj
5165 if isinstance(key, (list, tuple)):
5166 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5167 key = ...
5168 if key is ...:
5169 obj = (obj.values() if isinstance(obj, dict)
5170 else obj if isinstance(obj, (list, tuple, LazyList))
5171 else str(obj) if traverse_string else [])
5172 _current_depth += 1
5173 depth = max(depth, _current_depth)
5174 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5175 elif callable(key):
5176 if isinstance(obj, (list, tuple, LazyList)):
5177 obj = enumerate(obj)
5178 elif isinstance(obj, dict):
5179 obj = obj.items()
5180 else:
5181 if not traverse_string:
5182 return None
5183 obj = str(obj)
5184 _current_depth += 1
5185 depth = max(depth, _current_depth)
5186 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5187 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5188 obj = (obj.get(key) if casesense or (key in obj)
5189 else next((v for k, v in obj.items() if _lower(k) == key), None))
5190 else:
5191 if is_user_input:
5192 key = (int_or_none(key) if ':' not in key
5193 else slice(*map(int_or_none, key.split(':'))))
5194 if key == slice(None):
5195 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5196 if not isinstance(key, (int, slice)):
5197 return None
5198 if not isinstance(obj, (list, tuple, LazyList)):
5199 if not traverse_string:
5200 return None
5201 obj = str(obj)
5202 try:
5203 obj = obj[key]
5204 except IndexError:
5205 return None
5206 return obj
5207
5208 if isinstance(expected_type, type):
5209 type_test = lambda val: val if isinstance(val, expected_type) else None
5210 elif expected_type is not None:
5211 type_test = expected_type
5212 else:
5213 type_test = lambda val: val
5214
5215 for path in path_list:
5216 depth = 0
5217 val = _traverse_obj(obj, path)
5218 if val is not None:
5219 if depth:
5220 for _ in range(depth - 1):
5221 val = itertools.chain.from_iterable(v for v in val if v is not None)
5222 val = [v for v in map(type_test, val) if v is not None]
5223 if val:
5224 return val if get_all else val[0]
5225 else:
5226 val = type_test(val)
5227 if val is not None:
5228 return val
5229 return default
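
# Illustrative usage sketch (hand-computed, untested): `...` fans out over all
# values at that level, while ints/strings index normally.
#   >>> traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', 0, 'b'))
#   1
#   >>> traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))
#   [1, 2]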
5230
5231
5232 def traverse_dict(dictn, keys, casesense=True):
5233 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5234 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5235 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5236
5237
5238 def get_first(obj, keys, **kwargs):
5239 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5240
5241
5242 def variadic(x, allowed_types=(str, bytes, dict)):
5243 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5244
5245
5246 def decode_base(value, digits):
5247 # Convert the given base-x string to an integer
5248 table = {char: index for index, char in enumerate(digits)}
5249 result = 0
5250 base = len(digits)
5251 for chr in value:
5252 result *= base
5253 result += table[chr]
5254 return result
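
# Illustrative usage sketch (hand-computed, untested): inverse of
# encode_base_n() for the same digit table.
#   >>> decode_base('7b', '0123456789abcdef')
#   123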
5255
5256
5257 def time_seconds(**kwargs):
5258 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5259 return t.timestamp()
5260
5261
5262 # create a JSON Web Signature (JWS) with the HS256 algorithm
5263 # the resulting format is JWS Compact Serialization
5264 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5265 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5266 def jwt_encode_hs256(payload_data, key, headers={}):
5267 header_data = {
5268 'alg': 'HS256',
5269 'typ': 'JWT',
5270 }
5271 if headers:
5272 header_data.update(headers)
5273 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5274 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5275 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5276 signature_b64 = base64.b64encode(h.digest())
5277 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5278 return token
5279
5280
5281 # This can be extended in the future to verify the signature and parse the header, returning the algorithm used if it's not HS256
5282 def jwt_decode_hs256(jwt):
5283 header_b64, payload_b64, signature_b64 = jwt.split('.')
5284 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5285 return payload_data
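
# Illustrative usage sketch (hand-computed, untested): jwt_encode_hs256()
# returns bytes, so decode it to str before splitting the compact form.
#   >>> token = jwt_encode_hs256({'uid': 1}, 'secret')
#   >>> jwt_decode_hs256(token.decode())
#   {'uid': 1}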
5286
5287
5288 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5289
5290
5291 @functools.cache
5292 def supports_terminal_sequences(stream):
5293 if compat_os_name == 'nt':
5294 if not WINDOWS_VT_MODE:
5295 return False
5296 elif not os.getenv('TERM'):
5297 return False
5298 try:
5299 return stream.isatty()
5300 except BaseException:
5301 return False
5302
5303
5304 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5305 if get_windows_version() < (10, 0, 10586):
5306 return
5307 global WINDOWS_VT_MODE
5308 try:
5309 Popen.run('', shell=True)
5310 except Exception:
5311 return
5312
5313 WINDOWS_VT_MODE = True
5314 supports_terminal_sequences.cache_clear()
5315
5316
5317 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5318
5319
5320 def remove_terminal_sequences(string):
5321 return _terminal_sequences_re.sub('', string)
5322
5323
5324 def number_of_digits(number):
5325 return len('%d' % number)
5326
5327
5328 def join_nonempty(*values, delim='-', from_dict=None):
5329 if from_dict is not None:
5330 values = map(from_dict.get, values)
5331 return delim.join(map(str, filter(None, values)))
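
# Illustrative usage sketch (hand-computed, untested): falsy values (None, '',
# 0, ...) are dropped before joining.
#   >>> join_nonempty('en', None, 'US')
#   'en-US'
#   >>> join_nonempty('a', 'b', delim='.')
#   'a.b'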
5332
5333
5334 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5335 """
5336 Find the largest format dimensions in terms of video width and, for each thumbnail:
5337 * Modify the URL: Match the width with the provided regex and replace with the former width
5338 * Update dimensions
5339
5340 This function is useful with video services that scale the provided thumbnails on demand
5341 """
5342 _keys = ('width', 'height')
5343 max_dimensions = max(
5344 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5345 default=(0, 0))
5346 if not max_dimensions[0]:
5347 return thumbnails
5348 return [
5349 merge_dicts(
5350 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5351 dict(zip(_keys, max_dimensions)), thumbnail)
5352 for thumbnail in thumbnails
5353 ]
5354
5355
5356 def parse_http_range(range):
5357 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5358 if not range:
5359 return None, None, None
5360 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5361 if not crg:
5362 return None, None, None
5363 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
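
# Illustrative usage sketch (hand-computed, untested): works for both request
# ("Range") and response ("Content-Range") header values.
#   >>> parse_http_range('bytes=0-499')
#   (0, 499, None)
#   >>> parse_http_range('bytes 0-499/1234')
#   (0, 499, 1234)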
5364
5365
5366 def read_stdin(what):
5367 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5368 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5369 return sys.stdin
5370
5371
5372 class Config:
5373 own_args = None
5374 parsed_args = None
5375 filename = None
5376 __initialized = False
5377
5378 def __init__(self, parser, label=None):
5379 self.parser, self.label = parser, label
5380 self._loaded_paths, self.configs = set(), []
5381
5382 def init(self, args=None, filename=None):
5383 assert not self.__initialized
5384 directory = ''
5385 if filename:
5386 location = os.path.realpath(filename)
5387 directory = os.path.dirname(location)
5388 if location in self._loaded_paths:
5389 return False
5390 self._loaded_paths.add(location)
5391
5392 self.own_args, self.__initialized = args, True
5393 opts, _ = self.parser.parse_known_args(args)
5394 self.parsed_args, self.filename = args, filename
5395
5396 for location in opts.config_locations or []:
5397 if location == '-':
5398 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5399 continue
5400 location = os.path.join(directory, expand_path(location))
5401 if os.path.isdir(location):
5402 location = os.path.join(location, 'yt-dlp.conf')
5403 if not os.path.exists(location):
5404 self.parser.error(f'config location {location} does not exist')
5405 self.append_config(self.read_file(location), location)
5406 return True
5407
5408 def __str__(self):
5409 label = join_nonempty(
5410 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5411 delim=' ')
5412 return join_nonempty(
5413 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5414 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5415 delim='\n')
5416
5417 @staticmethod
5418 def read_file(filename, default=[]):
5419 try:
5420 optionf = open(filename)
5421 except OSError:
5422 return default # silently skip if file is not present
5423 try:
5424 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5425 contents = optionf.read()
5426 res = shlex.split(contents, comments=True)
5427 except Exception as err:
5428 raise ValueError(f'Unable to parse "{filename}": {err}')
5429 finally:
5430 optionf.close()
5431 return res
5432
5433 @staticmethod
5434 def hide_login_info(opts):
5435 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5436 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5437
5438 def _scrub_eq(o):
5439 m = eqre.match(o)
5440 if m:
5441 return m.group('key') + '=PRIVATE'
5442 else:
5443 return o
5444
5445 opts = list(map(_scrub_eq, opts))
5446 for idx, opt in enumerate(opts):
5447 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5448 opts[idx + 1] = 'PRIVATE'
5449 return opts
5450
5451 def append_config(self, *args, label=None):
5452 config = type(self)(self.parser, label)
5453 config._loaded_paths = self._loaded_paths
5454 if config.init(*args):
5455 self.configs.append(config)
5456
5457 @property
5458 def all_args(self):
5459 for config in reversed(self.configs):
5460 yield from config.all_args
5461 yield from self.parsed_args or []
5462
5463 def parse_known_args(self, **kwargs):
5464 return self.parser.parse_known_args(self.all_args, **kwargs)
5465
5466 def parse_args(self):
5467 return self.parser.parse_args(self.all_args)
5468
5469
5470 class WebSocketsWrapper:
5471 """Wraps websockets module to use in non-async scopes"""
5472 pool = None
5473
5474 def __init__(self, url, headers=None, connect=True):
5475 self.loop = asyncio.new_event_loop()
5476 # XXX: "loop" is deprecated
5477 self.conn = websockets.connect(
5478 url, extra_headers=headers, ping_interval=None,
5479 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5480 if connect:
5481 self.__enter__()
5482 atexit.register(self.__exit__, None, None, None)
5483
5484 def __enter__(self):
5485 if not self.pool:
5486 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5487 return self
5488
5489 def send(self, *args):
5490 self.run_with_loop(self.pool.send(*args), self.loop)
5491
5492 def recv(self, *args):
5493 return self.run_with_loop(self.pool.recv(*args), self.loop)
5494
5495 def __exit__(self, type, value, traceback):
5496 try:
5497 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5498 finally:
5499 self._cancel_all_tasks(self.loop)
5500 self.loop.close()
5501
5502 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5503 # for contributors: if a new library that uses asyncio needs to run in non-async code, move these functions out of this class
5504 @staticmethod
5505 def run_with_loop(main, loop):
5506 if not asyncio.iscoroutine(main):
5507 raise ValueError(f'a coroutine was expected, got {main!r}')
5508
5509 try:
5510 return loop.run_until_complete(main)
5511 finally:
5512 loop.run_until_complete(loop.shutdown_asyncgens())
5513 if hasattr(loop, 'shutdown_default_executor'):
5514 loop.run_until_complete(loop.shutdown_default_executor())
5515
5516 @staticmethod
5517 def _cancel_all_tasks(loop):
5518 to_cancel = asyncio.all_tasks(loop)
5519
5520 if not to_cancel:
5521 return
5522
5523 for task in to_cancel:
5524 task.cancel()
5525
5526 # XXX: "loop" is removed in python 3.10+
5527 loop.run_until_complete(
5528 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5529
5530 for task in to_cancel:
5531 if task.cancelled():
5532 continue
5533 if task.exception() is not None:
5534 loop.call_exception_handler({
5535 'message': 'unhandled exception during asyncio.run() shutdown',
5536 'exception': task.exception(),
5537 'task': task,
5538 })
5539
5540
5541 def merge_headers(*dicts):
5542 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5543 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
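
# Illustrative usage sketch (hand-computed, untested): keys are title-cased, so
# differently-cased duplicates collapse, with later dicts winning.
#   >>> merge_headers({'user-agent': 'UA1', 'x-foo': 'a'}, {'User-Agent': 'UA2'})
#   {'User-Agent': 'UA2', 'X-Foo': 'a'}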
5544
5545
5546 class classproperty:
5547 """classmethod(property(func)) that works in py < 3.9"""
5548
5549 def __init__(self, func):
5550 functools.update_wrapper(self, func)
5551 self.func = func
5552
5553 def __get__(self, _, cls):
5554 return self.func(cls)
5555
5556
5557 class Namespace(types.SimpleNamespace):
5558 """Immutable namespace"""
5559
5560 def __iter__(self):
5561 return iter(self.__dict__.values())
5562
5563 @property
5564 def items_(self):
5565 return self.__dict__.items()
5566
5567
5568 # Deprecated
5569 has_certifi = bool(certifi)
5570 has_websockets = bool(websockets)