#!/usr/bin/env python3
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import types
import urllib.parse
import xml.etree.ElementTree
import zlib

from .compat import asyncio, functools  # isort: split
from .compat import (
    compat_chr,
    compat_cookiejar,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_http_client,
    compat_HTTPError,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
)
from .dependencies import brotli, certifi, websockets
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

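# Usage sketch (editorial illustration; the namespace URI is a made-up example):
#   >>> xpath_with_ns('media:song/media:url', {'media': 'http://example.com/ns'})
#   '{http://example.com/ns}song/{http://example.com/ns}url'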

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

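# Usage sketch (editorial illustration of the substitutions performed above):
#   >>> clean_html('<p>foo</p><p>bar &amp; baz</p>')
#   'foo\nbar & baz'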

class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

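# Usage sketch (editorial illustration; any RFC 2822 date string works):
#   >>> timeconvert('Wed, 14 May 2014 10:00:00 +0000')
#   1400061600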

def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

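# Usage sketches (editorial illustration; expected values follow the rules above):
#   >>> sanitize_filename('New World record at 0:12:34')
#   'New World record at 0_12_34'
#   >>> sanitize_filename('aäb中国的c', restricted=True)
#   'aab_c'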

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

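# Usage sketches (editorial illustration):
#   >>> sanitize_url('//example.com/watch')
#   'http://example.com/watch'
#   >>> sanitize_url('httpss://example.com')
#   'https://example.com'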

def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'

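# Usage sketch (editorial illustration; the credentials are dummy values):
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')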

def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res

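# Usage sketch (editorial illustration):
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]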

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

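# Usage sketches (editorial illustration):
#   >>> unescapeHTML('&amp;')
#   '&'
#   >>> unescapeHTML('&#x2F;')
#   '/'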

def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)

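# Usage sketch (editorial illustration):
#   >>> timetuple_from_msec(345123)
#   Time(hours=0, minutes=5, seconds=45, milliseconds=123)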

def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

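# Usage sketches (editorial illustration):
#   >>> formatSeconds(3661)
#   '1:01:01'
#   >>> formatSeconds(61, msec=True)
#   '1:01.000'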

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super().__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'

class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise

class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise compat_cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True

1599
1600 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1601 def __init__(self, cookiejar=None):
1602 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1603
1604 def http_response(self, request, response):
1605 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1606
1607 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1608 https_response = http_response
1609
1610
1611 class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1612 """YoutubeDL redirect handler
1613
1614 The code is based on HTTPRedirectHandler implementation from CPython [1].
1615
1616 This redirect handler solves two issues:
1617 - ensures redirect URL is always unicode under python 2
1618 - introduces support for experimental HTTP response status code
1619 308 Permanent Redirect [2] used by some sites [3]
1620
1621 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1622 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1623 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1624 """
1625
1626 http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
1627
1628 def redirect_request(self, req, fp, code, msg, headers, newurl):
1629 """Return a Request or None in response to a redirect.
1630
1631 This is called by the http_error_30x methods when a
1632 redirection response is received. If a redirection should
1633 take place, return a new Request to allow http_error_30x to
1634 perform the redirect. Otherwise, raise HTTPError if no-one
1635 else should try to handle this url. Return None if you can't
1636 but another Handler might.
1637 """
1638 m = req.get_method()
1639 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1640 or code in (301, 302, 303) and m == "POST")):
1641 raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1642 # Strictly (according to RFC 2616), 301 or 302 in response to
1643 # a POST MUST NOT cause a redirection without confirmation
1644 # from the user (of urllib.request, in this case). In practice,
1645 # essentially all clients do redirect in this case, so we do
1646 # the same.
1647
1648 # Be conciliant with URIs containing a space. This is mainly
1649 # redundant with the more complete encoding done in http_error_302(),
1650 # but it is kept for compatibility with other callers.
1651 newurl = newurl.replace(' ', '%20')
1652
1653 CONTENT_HEADERS = ("content-length", "content-type")
1654 # NB: don't use dict comprehension for python 2.6 compatibility
1655 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1656
1657 # A 303 must either use GET or HEAD for subsequent request
1658 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1659 if code == 303 and m != 'HEAD':
1660 m = 'GET'
1661 # 301 and 302 redirects are commonly turned into a GET from a POST
1662 # for subsequent requests by browsers, so we'll do the same.
1663 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1664 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1665 if code in (301, 302) and m == 'POST':
1666 m = 'GET'
1667
1668 return compat_urllib_request.Request(
1669 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1670 unverifiable=True, method=m)
1671
1672
1673 def extract_timezone(date_str):
1674 m = re.search(
1675 r'''(?x)
1676 ^.{8,}? # >=8 char non-TZ prefix, if present
1677 (?P<tz>Z| # just the UTC Z, or
1678 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1679 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1680 [ ]? # optional space
1681 (?P<sign>\+|-) # +/-
1682 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1683 $)
1684 ''', date_str)
1685 if not m:
1686 timezone = datetime.timedelta()
1687 else:
1688 date_str = date_str[:-len(m.group('tz'))]
1689 if not m.group('sign'):
1690 timezone = datetime.timedelta()
1691 else:
1692 sign = 1 if m.group('sign') == '+' else -1
1693 timezone = datetime.timedelta(
1694 hours=sign * int(m.group('hours')),
1695 minutes=sign * int(m.group('minutes')))
1696 return timezone, date_str
1697
1698
1699 def parse_iso8601(date_str, delimiter='T', timezone=None):
1700 """ Return a UNIX timestamp from the given date """
1701
1702 if date_str is None:
1703 return None
1704
1705 date_str = re.sub(r'\.[0-9]+', '', date_str)
1706
1707 if timezone is None:
1708 timezone, date_str = extract_timezone(date_str)
1709
1710 with contextlib.suppress(ValueError):
1711 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1712 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1713 return calendar.timegm(dt.timetuple())
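
# Example (illustrative): the extracted offset is applied before conversion
# to a UNIX timestamp
#   parse_iso8601('2014-05-05T12:00:00+02:00')  # -> 1399284000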
1714
1715
1716 def date_formats(day_first=True):
1717 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1718
1719
1720 def unified_strdate(date_str, day_first=True):
1721 """Return a string with the date in the format YYYYMMDD"""
1722
1723 if date_str is None:
1724 return None
1725 upload_date = None
1726 # Replace commas
1727 date_str = date_str.replace(',', ' ')
1728 # Remove AM/PM + timezone
1729 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1730 _, date_str = extract_timezone(date_str)
1731
1732 for expression in date_formats(day_first):
1733 with contextlib.suppress(ValueError):
1734 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1735 if upload_date is None:
1736 timetuple = email.utils.parsedate_tz(date_str)
1737 if timetuple:
1738 with contextlib.suppress(ValueError):
1739 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1740 if upload_date is not None:
1741 return compat_str(upload_date)
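
# Example (illustrative; assumes the DATE_FORMATS tables defined earlier in
# this module include '%B %d %Y'):
#   unified_strdate('December 21, 2010')  # -> '20101221'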
1742
1743
1744 def unified_timestamp(date_str, day_first=True):
1745 if date_str is None:
1746 return None
1747
1748 date_str = re.sub(r'[,|]', '', date_str)
1749
1750 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1751 timezone, date_str = extract_timezone(date_str)
1752
1753 # Remove AM/PM + timezone
1754 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1755
1756 # Remove unrecognized timezones from ISO 8601 alike timestamps
1757 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1758 if m:
1759 date_str = date_str[:-len(m.group('tz'))]
1760
1761 # Python only supports microseconds, so remove nanoseconds
1762 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1763 if m:
1764 date_str = m.group(1)
1765
1766 for expression in date_formats(day_first):
1767 with contextlib.suppress(ValueError):
1768 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1769 return calendar.timegm(dt.timetuple())
1770 timetuple = email.utils.parsedate_tz(date_str)
1771 if timetuple:
1772 return calendar.timegm(timetuple) + pm_delta * 3600
1773
1774
1775 def determine_ext(url, default_ext='unknown_video'):
1776 if url is None or '.' not in url:
1777 return default_ext
1778 guess = url.partition('?')[0].rpartition('.')[2]
1779 if re.match(r'^[A-Za-z0-9]+$', guess):
1780 return guess
1781 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1782 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1783 return guess.rstrip('/')
1784 else:
1785 return default_ext
1786
1787
1788 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1789 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1790
1791
1792 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1793 R"""
1794 Return a datetime object from a string.
1795 Supported format:
1796 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1797
1798 @param format strftime format of DATE
1799 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1800 auto: round to the unit provided in date_str (if applicable).
1801 """
1802 auto_precision = False
1803 if precision == 'auto':
1804 auto_precision = True
1805 precision = 'microsecond'
1806 today = datetime_round(datetime.datetime.utcnow(), precision)
1807 if date_str in ('now', 'today'):
1808 return today
1809 if date_str == 'yesterday':
1810 return today - datetime.timedelta(days=1)
1811 match = re.match(
1812 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1813 date_str)
1814 if match is not None:
1815 start_time = datetime_from_str(match.group('start'), precision, format)
1816 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1817 unit = match.group('unit')
1818 if unit == 'month' or unit == 'year':
1819 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1820 unit = 'day'
1821 else:
1822 if unit == 'week':
1823 unit = 'day'
1824 time *= 7
1825 delta = datetime.timedelta(**{unit + 's': time})
1826 new_date = start_time + delta
1827 if auto_precision:
1828 return datetime_round(new_date, unit)
1829 return new_date
1830
1831 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
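
# Example (illustrative): a relative offset is resolved against the parsed date
#   datetime_from_str('20200229-1month', precision='day')
#   -> datetime.datetime(2020, 1, 29, 0, 0)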
1832
1833
1834 def date_from_str(date_str, format='%Y%m%d', strict=False):
1835 R"""
1836 Return a date object from a string using datetime_from_str
1837
1838 @param strict Restrict allowed patterns to "YYYYMMDD" and
1839 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1840 """
1841 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1842 raise ValueError(f'Invalid date format "{date_str}"')
1843 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1844
1845
1846 def datetime_add_months(dt, months):
1847 """Increment/Decrement a datetime object by months."""
1848 month = dt.month + months - 1
1849 year = dt.year + month // 12
1850 month = month % 12 + 1
1851 day = min(dt.day, calendar.monthrange(year, month)[1])
1852 return dt.replace(year, month, day)
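
# Example (illustrative): the day is clamped to the length of the target month
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1)
#   -> datetime.datetime(2020, 2, 29, 0, 0)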
1853
1854
1855 def datetime_round(dt, precision='day'):
1856 """
1857 Round a datetime object's time to a specific precision
1858 """
1859 if precision == 'microsecond':
1860 return dt
1861
1862 unit_seconds = {
1863 'day': 86400,
1864 'hour': 3600,
1865 'minute': 60,
1866 'second': 1,
1867 }
1868 roundto = lambda x, n: ((x + n / 2) // n) * n
1869 timestamp = calendar.timegm(dt.timetuple())
1870 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1871
1872
1873 def hyphenate_date(date_str):
1874 """
1875 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1876 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1877 if match is not None:
1878 return '-'.join(match.groups())
1879 else:
1880 return date_str
1881
1882
1883 class DateRange:
1884 """Represents a time interval between two dates"""
1885
1886 def __init__(self, start=None, end=None):
1887 """start and end must be strings in the format accepted by date"""
1888 if start is not None:
1889 self.start = date_from_str(start, strict=True)
1890 else:
1891 self.start = datetime.datetime.min.date()
1892 if end is not None:
1893 self.end = date_from_str(end, strict=True)
1894 else:
1895 self.end = datetime.datetime.max.date()
1896 if self.start > self.end:
1897 raise ValueError('Date range: "%s"; the start date must be before the end date' % self)
1898
1899 @classmethod
1900 def day(cls, day):
1901 """Returns a range that only contains the given day"""
1902 return cls(day, day)
1903
1904 def __contains__(self, date):
1905 """Check if the date is in the range"""
1906 if not isinstance(date, datetime.date):
1907 date = date_from_str(date)
1908 return self.start <= date <= self.end
1909
1910 def __str__(self):
1911 return f'{self.start.isoformat()} - {self.end.isoformat()}'
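
# Example (illustrative): membership accepts date objects or date strings
#   '20220315' in DateRange('20220301', '20220331')  # -> True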
1912
1913
1914 def platform_name():
1915 """ Returns the platform name as a compat_str """
1916 res = platform.platform()
1917 if isinstance(res, bytes):
1918 res = res.decode(preferredencoding())
1919
1920 assert isinstance(res, compat_str)
1921 return res
1922
1923
1924 @functools.cache
1925 def get_windows_version():
1926 ''' Get Windows version. Returns () if it's not running on Windows '''
1927 if compat_os_name == 'nt':
1928 return version_tuple(platform.win32_ver()[1])
1929 else:
1930 return ()
1931
1932
1933 def write_string(s, out=None, encoding=None):
1934 assert isinstance(s, str)
1935 out = out or sys.stderr
1936
1937 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1938 s = re.sub(r'([\r\n]+)', r' \1', s)
1939
1940 enc, buffer = None, out
1941 if 'b' in getattr(out, 'mode', ''):
1942 enc = encoding or preferredencoding()
1943 elif hasattr(out, 'buffer'):
1944 buffer = out.buffer
1945 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1946
1947 buffer.write(s.encode(enc, 'ignore') if enc else s)
1948 out.flush()
1949
1950
1951 def bytes_to_intlist(bs):
1952 if not bs:
1953 return []
1954 if isinstance(bs[0], int): # Python 3
1955 return list(bs)
1956 else:
1957 return [ord(c) for c in bs]
1958
1959
1960 def intlist_to_bytes(xs):
1961 if not xs:
1962 return b''
1963 return compat_struct_pack('%dB' % len(xs), *xs)
1964
1965
1966 class LockingUnsupportedError(OSError):
1967 msg = 'File locking is not supported'
1968
1969 def __init__(self):
1970 super().__init__(self.msg)
1971
1972
1973 # Cross-platform file locking
1974 if sys.platform == 'win32':
1975 import ctypes.wintypes
1976 import msvcrt
1977
1978 class OVERLAPPED(ctypes.Structure):
1979 _fields_ = [
1980 ('Internal', ctypes.wintypes.LPVOID),
1981 ('InternalHigh', ctypes.wintypes.LPVOID),
1982 ('Offset', ctypes.wintypes.DWORD),
1983 ('OffsetHigh', ctypes.wintypes.DWORD),
1984 ('hEvent', ctypes.wintypes.HANDLE),
1985 ]
1986
1987 kernel32 = ctypes.windll.kernel32
1988 LockFileEx = kernel32.LockFileEx
1989 LockFileEx.argtypes = [
1990 ctypes.wintypes.HANDLE, # hFile
1991 ctypes.wintypes.DWORD, # dwFlags
1992 ctypes.wintypes.DWORD, # dwReserved
1993 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1994 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1995 ctypes.POINTER(OVERLAPPED) # Overlapped
1996 ]
1997 LockFileEx.restype = ctypes.wintypes.BOOL
1998 UnlockFileEx = kernel32.UnlockFileEx
1999 UnlockFileEx.argtypes = [
2000 ctypes.wintypes.HANDLE, # hFile
2001 ctypes.wintypes.DWORD, # dwReserved
2002 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2003 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2004 ctypes.POINTER(OVERLAPPED) # Overlapped
2005 ]
2006 UnlockFileEx.restype = ctypes.wintypes.BOOL
2007 whole_low = 0xffffffff
2008 whole_high = 0x7fffffff
2009
2010 def _lock_file(f, exclusive, block):
2011 overlapped = OVERLAPPED()
2012 overlapped.Offset = 0
2013 overlapped.OffsetHigh = 0
2014 overlapped.hEvent = 0
2015 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2016
2017 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2018 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2019 0, whole_low, whole_high, f._lock_file_overlapped_p):
2020 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2021 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2022
2023 def _unlock_file(f):
2024 assert f._lock_file_overlapped_p
2025 handle = msvcrt.get_osfhandle(f.fileno())
2026 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2027 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2028
2029 else:
2030 try:
2031 import fcntl
2032
2033 def _lock_file(f, exclusive, block):
2034 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2035 if not block:
2036 flags |= fcntl.LOCK_NB
2037 try:
2038 fcntl.flock(f, flags)
2039 except BlockingIOError:
2040 raise
2041 except OSError: # AOSP does not have flock()
2042 fcntl.lockf(f, flags)
2043
2044 def _unlock_file(f):
2045 try:
2046 fcntl.flock(f, fcntl.LOCK_UN)
2047 except OSError:
2048 fcntl.lockf(f, fcntl.LOCK_UN)
2049
2050 except ImportError:
2051
2052 def _lock_file(f, exclusive, block):
2053 raise LockingUnsupportedError()
2054
2055 def _unlock_file(f):
2056 raise LockingUnsupportedError()
2057
2058
2059 class locked_file:
2060 locked = False
2061
2062 def __init__(self, filename, mode, block=True, encoding=None):
2063 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2064 raise NotImplementedError(mode)
2065 self.mode, self.block = mode, block
2066
2067 writable = any(f in mode for f in 'wax+')
2068 readable = any(f in mode for f in 'r+')
2069 flags = functools.reduce(operator.ior, (
2070 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2071 getattr(os, 'O_BINARY', 0), # Windows only
2072 getattr(os, 'O_NOINHERIT', 0), # Windows only
2073 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2074 os.O_APPEND if 'a' in mode else 0,
2075 os.O_EXCL if 'x' in mode else 0,
2076 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2077 ))
2078
2079 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2080
2081 def __enter__(self):
2082 exclusive = 'r' not in self.mode
2083 try:
2084 _lock_file(self.f, exclusive, self.block)
2085 self.locked = True
2086 except OSError:
2087 self.f.close()
2088 raise
2089 if 'w' in self.mode:
2090 try:
2091 self.f.truncate()
2092 except OSError as e:
2093 if e.errno not in (
2094 errno.ESPIPE, # Illegal seek - expected for FIFO
2095 errno.EINVAL, # Invalid argument - expected for /dev/null
2096 ):
2097 raise
2098 return self
2099
2100 def unlock(self):
2101 if not self.locked:
2102 return
2103 try:
2104 _unlock_file(self.f)
2105 finally:
2106 self.locked = False
2107
2108 def __exit__(self, *_):
2109 try:
2110 self.unlock()
2111 finally:
2112 self.f.close()
2113
2114 open = __enter__
2115 close = __exit__
2116
2117 def __getattr__(self, attr):
2118 return getattr(self.f, attr)
2119
2120 def __iter__(self):
2121 return iter(self.f)
2122
2123
2124 @functools.cache
2125 def get_filesystem_encoding():
2126 encoding = sys.getfilesystemencoding()
2127 return encoding if encoding is not None else 'utf-8'
2128
2129
2130 def shell_quote(args):
2131 quoted_args = []
2132 encoding = get_filesystem_encoding()
2133 for a in args:
2134 if isinstance(a, bytes):
2135 # We may get a filename encoded with 'encodeFilename'
2136 a = a.decode(encoding)
2137 quoted_args.append(compat_shlex_quote(a))
2138 return ' '.join(quoted_args)
2139
2140
2141 def smuggle_url(url, data):
2142 """ Pass additional data in a URL for internal use. """
2143
2144 url, idata = unsmuggle_url(url, {})
2145 data.update(idata)
2146 sdata = compat_urllib_parse_urlencode(
2147 {'__youtubedl_smuggle': json.dumps(data)})
2148 return url + '#' + sdata
2149
2150
2151 def unsmuggle_url(smug_url, default=None):
2152 if '#__youtubedl_smuggle' not in smug_url:
2153 return smug_url, default
2154 url, _, sdata = smug_url.rpartition('#')
2155 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2156 data = json.loads(jsond)
2157 return url, data
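
# Example (illustrative): smuggle_url/unsmuggle_url round-trip data through
# the URL fragment
#   unsmuggle_url(smuggle_url('https://example.com/video', {'referer': 'https://example.com'}))
#   -> ('https://example.com/video', {'referer': 'https://example.com'})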
2158
2159
2160 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2161 """ Formats numbers with decimal sufixes like K, M, etc """
2162 num, factor = float_or_none(num), float(factor)
2163 if num is None or num < 0:
2164 return None
2165 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2166 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2167 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2168 if factor == 1024:
2169 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2170 converted = num / (factor ** exponent)
2171 return fmt % (converted, suffix)
2172
2173
2174 def format_bytes(bytes):
2175 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
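
# Examples (illustrative): binary units (factor=1024) gain an 'i' infix
#   format_decimal_suffix(12345)  # -> '12k'
#   format_bytes(1500)            # -> '1.46KiB'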
2176
2177
2178 def lookup_unit_table(unit_table, s):
2179 units_re = '|'.join(re.escape(u) for u in unit_table)
2180 m = re.match(
2181 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2182 if not m:
2183 return None
2184 num_str = m.group('num').replace(',', '.')
2185 mult = unit_table[m.group('unit')]
2186 return int(float(num_str) * mult)
2187
2188
2189 def parse_filesize(s):
2190 if s is None:
2191 return None
2192
2193 # The lower-case forms are of course incorrect and unofficial,
2194 # but we support those too
2195 _UNIT_TABLE = {
2196 'B': 1,
2197 'b': 1,
2198 'bytes': 1,
2199 'KiB': 1024,
2200 'KB': 1000,
2201 'kB': 1024,
2202 'Kb': 1000,
2203 'kb': 1000,
2204 'kilobytes': 1000,
2205 'kibibytes': 1024,
2206 'MiB': 1024 ** 2,
2207 'MB': 1000 ** 2,
2208 'mB': 1024 ** 2,
2209 'Mb': 1000 ** 2,
2210 'mb': 1000 ** 2,
2211 'megabytes': 1000 ** 2,
2212 'mebibytes': 1024 ** 2,
2213 'GiB': 1024 ** 3,
2214 'GB': 1000 ** 3,
2215 'gB': 1024 ** 3,
2216 'Gb': 1000 ** 3,
2217 'gb': 1000 ** 3,
2218 'gigabytes': 1000 ** 3,
2219 'gibibytes': 1024 ** 3,
2220 'TiB': 1024 ** 4,
2221 'TB': 1000 ** 4,
2222 'tB': 1024 ** 4,
2223 'Tb': 1000 ** 4,
2224 'tb': 1000 ** 4,
2225 'terabytes': 1000 ** 4,
2226 'tebibytes': 1024 ** 4,
2227 'PiB': 1024 ** 5,
2228 'PB': 1000 ** 5,
2229 'pB': 1024 ** 5,
2230 'Pb': 1000 ** 5,
2231 'pb': 1000 ** 5,
2232 'petabytes': 1000 ** 5,
2233 'pebibytes': 1024 ** 5,
2234 'EiB': 1024 ** 6,
2235 'EB': 1000 ** 6,
2236 'eB': 1024 ** 6,
2237 'Eb': 1000 ** 6,
2238 'eb': 1000 ** 6,
2239 'exabytes': 1000 ** 6,
2240 'exbibytes': 1024 ** 6,
2241 'ZiB': 1024 ** 7,
2242 'ZB': 1000 ** 7,
2243 'zB': 1024 ** 7,
2244 'Zb': 1000 ** 7,
2245 'zb': 1000 ** 7,
2246 'zettabytes': 1000 ** 7,
2247 'zebibytes': 1024 ** 7,
2248 'YiB': 1024 ** 8,
2249 'YB': 1000 ** 8,
2250 'yB': 1024 ** 8,
2251 'Yb': 1000 ** 8,
2252 'yb': 1000 ** 8,
2253 'yottabytes': 1000 ** 8,
2254 'yobibytes': 1024 ** 8,
2255 }
2256
2257 return lookup_unit_table(_UNIT_TABLE, s)
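
# Examples (illustrative): decimal vs binary units, ',' as decimal separator
#   parse_filesize('1,5MB')  # -> 1500000
#   parse_filesize('5 GiB')  # -> 5368709120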
2258
2259
2260 def parse_count(s):
2261 if s is None:
2262 return None
2263
2264 s = re.sub(r'^[^\d]+\s', '', s).strip()
2265
2266 if re.match(r'^[\d,.]+$', s):
2267 return str_to_int(s)
2268
2269 _UNIT_TABLE = {
2270 'k': 1000,
2271 'K': 1000,
2272 'm': 1000 ** 2,
2273 'M': 1000 ** 2,
2274 'kk': 1000 ** 2,
2275 'KK': 1000 ** 2,
2276 'b': 1000 ** 3,
2277 'B': 1000 ** 3,
2278 }
2279
2280 ret = lookup_unit_table(_UNIT_TABLE, s)
2281 if ret is not None:
2282 return ret
2283
2284 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2285 if mobj:
2286 return str_to_int(mobj.group(1))
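
# Examples (illustrative):
#   parse_count('1.1k')         # -> 1100
#   parse_count('1,234 views')  # -> 1234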
2287
2288
2289 def parse_resolution(s, *, lenient=False):
2290 if s is None:
2291 return {}
2292
2293 if lenient:
2294 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2295 else:
2296 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2297 if mobj:
2298 return {
2299 'width': int(mobj.group('w')),
2300 'height': int(mobj.group('h')),
2301 }
2302
2303 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2304 if mobj:
2305 return {'height': int(mobj.group(1))}
2306
2307 mobj = re.search(r'\b([48])[kK]\b', s)
2308 if mobj:
2309 return {'height': int(mobj.group(1)) * 540}
2310
2311 return {}
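
# Examples (illustrative):
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # -> {'height': 720}
#   parse_resolution('4k')         # -> {'height': 2160}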
2312
2313
2314 def parse_bitrate(s):
2315 if not isinstance(s, compat_str):
2316 return
2317 mobj = re.search(r'\b(\d+)\s*kbps', s)
2318 if mobj:
2319 return int(mobj.group(1))
2320
2321
2322 def month_by_name(name, lang='en'):
2323 """ Return the number of a month by (locale-independently) English name """
2324
2325 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2326
2327 try:
2328 return month_names.index(name) + 1
2329 except ValueError:
2330 return None
2331
2332
2333 def month_by_abbreviation(abbrev):
2334 """ Return the number of a month by (locale-independently) English
2335 abbreviations """
2336
2337 try:
2338 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2339 except ValueError:
2340 return None
2341
2342
2343 def fix_xml_ampersands(xml_str):
2344 """Replace all the '&' by '&amp;' in XML"""
2345 return re.sub(
2346 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2347 '&amp;',
2348 xml_str)
2349
2350
2351 def setproctitle(title):
2352 assert isinstance(title, compat_str)
2353
2354 # ctypes in Jython is not complete
2355 # http://bugs.jython.org/issue2148
2356 if sys.platform.startswith('java'):
2357 return
2358
2359 try:
2360 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2361 except OSError:
2362 return
2363 except TypeError:
2364 # LoadLibrary in Windows Python 2.7.13 only expects
2365 # a bytestring, but since unicode_literals turns
2366 # every string into a unicode string, it fails.
2367 return
2368 title_bytes = title.encode()
2369 buf = ctypes.create_string_buffer(len(title_bytes))
2370 buf.value = title_bytes
2371 try:
2372 libc.prctl(15, buf, 0, 0, 0)
2373 except AttributeError:
2374 return # Strange libc, just skip this
2375
2376
2377 def remove_start(s, start):
2378 return s[len(start):] if s is not None and s.startswith(start) else s
2379
2380
2381 def remove_end(s, end):
2382 return s[:-len(end)] if s is not None and s.endswith(end) else s
2383
2384
2385 def remove_quotes(s):
2386 if s is None or len(s) < 2:
2387 return s
2388 for quote in ('"', "'", ):
2389 if s[0] == quote and s[-1] == quote:
2390 return s[1:-1]
2391 return s
2392
2393
2394 def get_domain(url):
2395 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2396 return domain.group('domain') if domain else None
2397
2398
2399 def url_basename(url):
2400 path = compat_urlparse.urlparse(url).path
2401 return path.strip('/').split('/')[-1]
2402
2403
2404 def base_url(url):
2405 return re.match(r'https?://[^?#&]+/', url).group()
2406
2407
2408 def urljoin(base, path):
2409 if isinstance(path, bytes):
2410 path = path.decode()
2411 if not isinstance(path, compat_str) or not path:
2412 return None
2413 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', path):  # NB: '-' last so it does not form a range
2414 return path
2415 if isinstance(base, bytes):
2416 base = base.decode()
2417 if not isinstance(base, compat_str) or not re.match(
2418 r'^(?:https?:)?//', base):
2419 return None
2420 return compat_urlparse.urljoin(base, path)
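
# Examples (illustrative): protocol-relative paths are returned unchanged
#   urljoin('https://example.com/a/', 'b/c')                  # -> 'https://example.com/a/b/c'
#   urljoin('https://example.com/a/', '//cdn.example.com/x')  # -> '//cdn.example.com/x'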
2421
2422
2423 class HEADRequest(compat_urllib_request.Request):
2424 def get_method(self):
2425 return 'HEAD'
2426
2427
2428 class PUTRequest(compat_urllib_request.Request):
2429 def get_method(self):
2430 return 'PUT'
2431
2432
2433 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2434 if get_attr and v is not None:
2435 v = getattr(v, get_attr, None)
2436 try:
2437 return int(v) * invscale // scale
2438 except (ValueError, TypeError, OverflowError):
2439 return default
2440
2441
2442 def str_or_none(v, default=None):
2443 return default if v is None else compat_str(v)
2444
2445
2446 def str_to_int(int_str):
2447 """ A more relaxed version of int_or_none """
2448 if isinstance(int_str, int):
2449 return int_str
2450 elif isinstance(int_str, compat_str):
2451 int_str = re.sub(r'[,\.\+]', '', int_str)
2452 return int_or_none(int_str)
2453
2454
2455 def float_or_none(v, scale=1, invscale=1, default=None):
2456 if v is None:
2457 return default
2458 try:
2459 return float(v) * invscale / scale
2460 except (ValueError, TypeError):
2461 return default
2462
2463
2464 def bool_or_none(v, default=None):
2465 return v if isinstance(v, bool) else default
2466
2467
2468 def strip_or_none(v, default=None):
2469 return v.strip() if isinstance(v, compat_str) else default
2470
2471
2472 def url_or_none(url):
2473 if not url or not isinstance(url, compat_str):
2474 return None
2475 url = url.strip()
2476 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2477
2478
2479 def request_to_url(req):
2480 if isinstance(req, compat_urllib_request.Request):
2481 return req.get_full_url()
2482 else:
2483 return req
2484
2485
2486 def strftime_or_none(timestamp, date_format, default=None):
2487 datetime_object = None
2488 try:
2489 if isinstance(timestamp, (int, float)): # unix timestamp
2490 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2491 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2492 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2493 return datetime_object.strftime(date_format)
2494 except (ValueError, TypeError, AttributeError):
2495 return default
2496
2497
2498 def parse_duration(s):
2499 if not isinstance(s, str):
2500 return None
2501 s = s.strip()
2502 if not s:
2503 return None
2504
2505 days, hours, mins, secs, ms = [None] * 5
2506 m = re.match(r'''(?x)
2507 (?P<before_secs>
2508 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2509 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2510 (?P<ms>[.:][0-9]+)?Z?$
2511 ''', s)
2512 if m:
2513 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2514 else:
2515 m = re.match(
2516 r'''(?ix)(?:P?
2517 (?:
2518 [0-9]+\s*y(?:ears?)?,?\s*
2519 )?
2520 (?:
2521 [0-9]+\s*m(?:onths?)?,?\s*
2522 )?
2523 (?:
2524 [0-9]+\s*w(?:eeks?)?,?\s*
2525 )?
2526 (?:
2527 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2528 )?
2529 T)?
2530 (?:
2531 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2532 )?
2533 (?:
2534 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2535 )?
2536 (?:
2537 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2538 )?Z?$''', s)
2539 if m:
2540 days, hours, mins, secs, ms = m.groups()
2541 else:
2542 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2543 if m:
2544 hours, mins = m.groups()
2545 else:
2546 return None
2547
2548 if ms:
2549 ms = ms.replace(':', '.')
2550 return sum(float(part or 0) * mult for part, mult in (
2551 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
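
# Examples (illustrative): both clock-style and verbose durations are supported
#   parse_duration('01:02:03.05')  # -> 3723.05
#   parse_duration('3 min')        # -> 180.0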
2552
2553
2554 def prepend_extension(filename, ext, expected_real_ext=None):
2555 name, real_ext = os.path.splitext(filename)
2556 return (
2557 f'{name}.{ext}{real_ext}'
2558 if not expected_real_ext or real_ext[1:] == expected_real_ext
2559 else f'{filename}.{ext}')
2560
2561
2562 def replace_extension(filename, ext, expected_real_ext=None):
2563 name, real_ext = os.path.splitext(filename)
2564 return '{}.{}'.format(
2565 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2566 ext)
2567
2568
2569 def check_executable(exe, args=[]):
2570 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2571 args can be a list of arguments for a short output (like -version) """
2572 try:
2573 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2574 except OSError:
2575 return False
2576 return exe
2577
2578
2579 def _get_exe_version_output(exe, args, *, to_screen=None):
2580 if to_screen:
2581 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2582 try:
2583 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2584 # SIGTTOU if yt-dlp is run in the background.
2585 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2586 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2587 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2588 except OSError:
2589 return False
2590 return stdout
2591
2592
2593 def detect_exe_version(output, version_re=None, unrecognized='present'):
2594 assert isinstance(output, compat_str)
2595 if version_re is None:
2596 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2597 m = re.search(version_re, output)
2598 if m:
2599 return m.group(1)
2600 else:
2601 return unrecognized
2602
2603
2604 def get_exe_version(exe, args=['--version'],
2605 version_re=None, unrecognized='present'):
2606 """ Returns the version of the specified executable,
2607 or False if the executable is not present """
2608 out = _get_exe_version_output(exe, args)
2609 return detect_exe_version(out, version_re, unrecognized) if out else False
2610
2611
2612 def frange(start=0, stop=None, step=1):
2613 """Float range"""
2614 if stop is None:
2615 start, stop = 0, start
2616 sign = [-1, 1][step > 0] if step else 0
2617 while sign * start < sign * stop:
2618 yield start
2619 start += step
2620
2621
2622 class LazyList(collections.abc.Sequence):
2623 """Lazy immutable list from an iterable
2624 Note that slices of a LazyList are lists and not LazyList"""
2625
2626 class IndexError(IndexError):
2627 pass
2628
2629 def __init__(self, iterable, *, reverse=False, _cache=None):
2630 self._iterable = iter(iterable)
2631 self._cache = [] if _cache is None else _cache
2632 self._reversed = reverse
2633
2634 def __iter__(self):
2635 if self._reversed:
2636 # We need to consume the entire iterable to iterate in reverse
2637 yield from self.exhaust()
2638 return
2639 yield from self._cache
2640 for item in self._iterable:
2641 self._cache.append(item)
2642 yield item
2643
2644 def _exhaust(self):
2645 self._cache.extend(self._iterable)
2646 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2647 return self._cache
2648
2649 def exhaust(self):
2650 """Evaluate the entire iterable"""
2651 return self._exhaust()[::-1 if self._reversed else 1]
2652
2653 @staticmethod
2654 def _reverse_index(x):
2655 return None if x is None else -(x + 1)
2656
2657 def __getitem__(self, idx):
2658 if isinstance(idx, slice):
2659 if self._reversed:
2660 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2661 start, stop, step = idx.start, idx.stop, idx.step or 1
2662 elif isinstance(idx, int):
2663 if self._reversed:
2664 idx = self._reverse_index(idx)
2665 start, stop, step = idx, idx, 0
2666 else:
2667 raise TypeError('indices must be integers or slices')
2668 if ((start or 0) < 0 or (stop or 0) < 0
2669 or (start is None and step < 0)
2670 or (stop is None and step > 0)):
2671 # We need to consume the entire iterable to be able to slice from the end
2672 # Obviously, never use this with infinite iterables
2673 self._exhaust()
2674 try:
2675 return self._cache[idx]
2676 except IndexError as e:
2677 raise self.IndexError(e) from e
2678 n = max(start or 0, stop or 0) - len(self._cache) + 1
2679 if n > 0:
2680 self._cache.extend(itertools.islice(self._iterable, n))
2681 try:
2682 return self._cache[idx]
2683 except IndexError as e:
2684 raise self.IndexError(e) from e
2685
2686 def __bool__(self):
2687 try:
2688 self[-1] if self._reversed else self[0]
2689 except self.IndexError:
2690 return False
2691 return True
2692
2693 def __len__(self):
2694 self._exhaust()
2695 return len(self._cache)
2696
2697 def __reversed__(self):
2698 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2699
2700 def __copy__(self):
2701 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2702
2703 def __repr__(self):
2704 # repr and str should mimic a list. So we exhaust the iterable
2705 return repr(self.exhaust())
2706
2707 def __str__(self):
2708 return repr(self.exhaust())
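
# Example (illustrative): only as much of the iterable as needed is consumed
#   lazy = LazyList(itertools.count())
#   lazy[5]   # -> 5 (consumes just the first six items)
#   lazy[:3]  # -> [0, 1, 2] (served from the cache)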
2709
2710
2711 class PagedList:
2712
2713 class IndexError(IndexError):
2714 pass
2715
2716 def __len__(self):
2717 # This is only useful for tests
2718 return len(self.getslice())
2719
2720 def __init__(self, pagefunc, pagesize, use_cache=True):
2721 self._pagefunc = pagefunc
2722 self._pagesize = pagesize
2723 self._pagecount = float('inf')
2724 self._use_cache = use_cache
2725 self._cache = {}
2726
2727 def getpage(self, pagenum):
2728 page_results = self._cache.get(pagenum)
2729 if page_results is None:
2730 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2731 if self._use_cache:
2732 self._cache[pagenum] = page_results
2733 return page_results
2734
2735 def getslice(self, start=0, end=None):
2736 return list(self._getslice(start, end))
2737
2738 def _getslice(self, start, end):
2739 raise NotImplementedError('This method must be implemented by subclasses')
2740
2741 def __getitem__(self, idx):
2742 assert self._use_cache, 'Indexing PagedList requires cache'
2743 if not isinstance(idx, int) or idx < 0:
2744 raise TypeError('indices must be non-negative integers')
2745 entries = self.getslice(idx, idx + 1)
2746 if not entries:
2747 raise self.IndexError()
2748 return entries[0]
2749
2750
2751 class OnDemandPagedList(PagedList):
2752 """Download pages until a page with less than maximum results"""
2753
2754 def _getslice(self, start, end):
2755 for pagenum in itertools.count(start // self._pagesize):
2756 firstid = pagenum * self._pagesize
2757 nextfirstid = pagenum * self._pagesize + self._pagesize
2758 if start >= nextfirstid:
2759 continue
2760
2761 startv = (
2762 start % self._pagesize
2763 if firstid <= start < nextfirstid
2764 else 0)
2765 endv = (
2766 ((end - 1) % self._pagesize) + 1
2767 if (end is not None and firstid <= end <= nextfirstid)
2768 else None)
2769
2770 try:
2771 page_results = self.getpage(pagenum)
2772 except Exception:
2773 self._pagecount = pagenum - 1
2774 raise
2775 if startv != 0 or endv is not None:
2776 page_results = page_results[startv:endv]
2777 yield from page_results
2778
2779 # A little optimization: if the current page is not "full", i.e. does
2780 # not contain page_size videos, then we can assume that this page
2781 # is the last one; there are no more ids on further pages,
2782 # so there is no need to query again.
2783 if len(page_results) + startv < self._pagesize:
2784 break
2785
2786 # If we got the whole page, but the next page is not interesting,
2787 # break out early as well
2788 if end == nextfirstid:
2789 break
2790
2791
2792 class InAdvancePagedList(PagedList):
2793 """PagedList with total number of pages known in advance"""
2794
2795 def __init__(self, pagefunc, pagecount, pagesize):
2796 PagedList.__init__(self, pagefunc, pagesize, True)
2797 self._pagecount = pagecount
2798
2799 def _getslice(self, start, end):
2800 start_page = start // self._pagesize
2801 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2802 skip_elems = start - start_page * self._pagesize
2803 only_more = None if end is None else end - start
2804 for pagenum in range(start_page, end_page):
2805 page_results = self.getpage(pagenum)
2806 if skip_elems:
2807 page_results = page_results[skip_elems:]
2808 skip_elems = None
2809 if only_more is not None:
2810 if len(page_results) < only_more:
2811 only_more -= len(page_results)
2812 else:
2813 yield from page_results[:only_more]
2814 break
2815 yield from page_results
2816
2817
2818 class PlaylistEntries:
2819 MissingEntry = object()
2820 is_exhausted = False
2821
2822 def __init__(self, ydl, info_dict):
2823 self.ydl, self.info_dict = ydl, info_dict
2824
2825 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2826 (?P<start>[+-]?\d+)?
2827 (?P<range>[:-]
2828 (?P<end>[+-]?\d+|inf(?:inite)?)?
2829 (?::(?P<step>[+-]?\d+))?
2830 )?''')
2831
2832 @classmethod
2833 def parse_playlist_items(cls, string):
2834 for segment in string.split(','):
2835 if not segment:
2836 raise ValueError('There are two or more consecutive commas')
2837 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2838 if not mobj:
2839 raise ValueError(f'{segment!r} is not a valid specification')
2840 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2841 if int_or_none(step) == 0:
2842 raise ValueError(f'Step in {segment!r} cannot be zero')
2843 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
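
    # Example (illustrative): the end of a range is parsed with float_or_none
    # so that 'inf' remains representable
    #   list(PlaylistEntries.parse_playlist_items('1:3,7,-5:'))
    #   -> [slice(1, 3.0, None), 7, slice(-5, None, None)]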
2844
2845 def get_requested_items(self):
2846 playlist_items = self.ydl.params.get('playlist_items')
2847 playlist_start = self.ydl.params.get('playliststart', 1)
2848 playlist_end = self.ydl.params.get('playlistend')
2849 # For backwards compatibility, interpret -1 as whole list
2850 if playlist_end in (-1, None):
2851 playlist_end = ''
2852 if not playlist_items:
2853 playlist_items = f'{playlist_start}:{playlist_end}'
2854 elif playlist_start != 1 or playlist_end:
2855 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2856
2857 for index in self.parse_playlist_items(playlist_items):
2858 for i, entry in self[index]:
2859 yield i, entry
2860 try:
2861 # TODO: Add auto-generated fields
2862 self.ydl._match_entry(entry, incomplete=True, silent=True)
2863 except (ExistingVideoReached, RejectedVideoReached):
2864 return
2865
2866 @property
2867 def full_count(self):
2868 if self.info_dict.get('playlist_count'):
2869 return self.info_dict['playlist_count']
2870 elif self.is_exhausted and not self.is_incomplete:
2871 return len(self)
2872 elif isinstance(self._entries, InAdvancePagedList):
2873 if self._entries._pagesize == 1:
2874 return self._entries._pagecount
2875
2876 @functools.cached_property
2877 def _entries(self):
2878 entries = self.info_dict.get('entries')
2879 if entries is None:
2880 raise EntryNotInPlaylist('There are no entries')
2881 elif isinstance(entries, list):
2882 self.is_exhausted = True
2883
2884 indices = self.info_dict.get('requested_entries')
2885 self.is_incomplete = bool(indices)
2886 if self.is_incomplete:
2887 assert self.is_exhausted
2888 ret = [self.MissingEntry] * max(indices)
2889 for i, entry in zip(indices, entries):
2890 ret[i - 1] = entry
2891 return ret
2892
2893 if isinstance(entries, (list, PagedList, LazyList)):
2894 return entries
2895 return LazyList(entries)
2896
2897 @functools.cached_property
2898 def _getter(self):
2899 if isinstance(self._entries, list):
2900 def get_entry(i):
2901 try:
2902 entry = self._entries[i]
2903 except IndexError:
2904 entry = self.MissingEntry
2905 if not self.is_incomplete:
2906 raise self.IndexError()
2907 if entry is self.MissingEntry:
2908 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2909 return entry
2910 else:
2911 def get_entry(i):
2912 try:
2913 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2914 except (LazyList.IndexError, PagedList.IndexError):
2915 raise self.IndexError()
2916 return get_entry
2917
2918 def __getitem__(self, idx):
2919 if isinstance(idx, int):
2920 idx = slice(idx, idx)
2921
2922 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2923 step = 1 if idx.step is None else idx.step
2924 if idx.start is None:
2925 start = 0 if step > 0 else len(self) - 1
2926 else:
2927 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2928
2929 # NB: Do not call len(self) when idx == [:]
2930 if idx.stop is None:
2931 stop = 0 if step < 0 else float('inf')
2932 else:
2933 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2934 stop += [-1, 1][step > 0]
2935
2936 for i in frange(start, stop, step):
2937 if i < 0:
2938 continue
2939 try:
2940 try:
2941 entry = self._getter(i)
2942 except self.IndexError:
2943 self.is_exhausted = True
2944 if step > 0:
2945 break
2946 continue
2947 except IndexError:
2948 if self.is_exhausted:
2949 break
2950 raise
2951 yield i + 1, entry
2952
2953 def __len__(self):
2954 return len(tuple(self[:]))
2955
2956 class IndexError(IndexError):
2957 pass
2958
2959
2960 def uppercase_escape(s):
2961 unicode_escape = codecs.getdecoder('unicode_escape')
2962 return re.sub(
2963 r'\\U[0-9a-fA-F]{8}',
2964 lambda m: unicode_escape(m.group(0))[0],
2965 s)
2966
2967
2968 def lowercase_escape(s):
2969 unicode_escape = codecs.getdecoder('unicode_escape')
2970 return re.sub(
2971 r'\\u[0-9a-fA-F]{4}',
2972 lambda m: unicode_escape(m.group(0))[0],
2973 s)
2974
2975
2976 def escape_rfc3986(s):
2977 """Escape non-ASCII characters as suggested by RFC 3986"""
2978 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2979
2980
2981 def escape_url(url):
2982 """Escape URL as suggested by RFC 3986"""
2983 url_parsed = compat_urllib_parse_urlparse(url)
2984 return url_parsed._replace(
2985 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2986 path=escape_rfc3986(url_parsed.path),
2987 params=escape_rfc3986(url_parsed.params),
2988 query=escape_rfc3986(url_parsed.query),
2989 fragment=escape_rfc3986(url_parsed.fragment)
2990 ).geturl()
2991
2992
2993 def parse_qs(url):
2994 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2995
2996
2997 def read_batch_urls(batch_fd):
2998 def fixup(url):
2999 if not isinstance(url, compat_str):
3000 url = url.decode('utf-8', 'replace')
3001 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3002 for bom in BOM_UTF8:
3003 if url.startswith(bom):
3004 url = url[len(bom):]
3005 url = url.lstrip()
3006 if not url or url.startswith(('#', ';', ']')):
3007 return False
3008 # "#" cannot be stripped out since it is part of the URI
3009 # However, it can be safely stripped out if it follows a whitespace
3010 return re.split(r'\s#', url, 1)[0].rstrip()
3011
3012 with contextlib.closing(batch_fd) as fd:
3013 return [url for url in map(fixup, fd) if url]
3014
3015
3016 def urlencode_postdata(*args, **kargs):
3017 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
3018
3019
3020 def update_url_query(url, query):
3021 if not query:
3022 return url
3023 parsed_url = compat_urlparse.urlparse(url)
3024 qs = compat_parse_qs(parsed_url.query)
3025 qs.update(query)
3026 return compat_urlparse.urlunparse(parsed_url._replace(
3027 query=compat_urllib_parse_urlencode(qs, True)))
3028
3029
3030 def update_Request(req, url=None, data=None, headers={}, query={}):
3031 req_headers = req.headers.copy()
3032 req_headers.update(headers)
3033 req_data = data or req.data
3034 req_url = update_url_query(url or req.get_full_url(), query)
3035 req_get_method = req.get_method()
3036 if req_get_method == 'HEAD':
3037 req_type = HEADRequest
3038 elif req_get_method == 'PUT':
3039 req_type = PUTRequest
3040 else:
3041 req_type = compat_urllib_request.Request
3042 new_req = req_type(
3043 req_url, data=req_data, headers=req_headers,
3044 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3045 if hasattr(req, 'timeout'):
3046 new_req.timeout = req.timeout
3047 return new_req
3048
3049
3050 def _multipart_encode_impl(data, boundary):
3051 content_type = 'multipart/form-data; boundary=%s' % boundary
3052
3053 out = b''
3054 for k, v in data.items():
3055 out += b'--' + boundary.encode('ascii') + b'\r\n'
3056 if isinstance(k, compat_str):
3057 k = k.encode()
3058 if isinstance(v, compat_str):
3059 v = v.encode()
3060 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3061 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3062 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3063 if boundary.encode('ascii') in content:
3064 raise ValueError('Boundary overlaps with data')
3065 out += content
3066
3067 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3068
3069 return out, content_type
3070
3071
3072 def multipart_encode(data, boundary=None):
3073 '''
3074 Encode a dict to RFC 7578-compliant form-data
3075
3076 data:
3077 A dict where keys and values can be either Unicode or bytes-like
3078 objects.
3079 boundary:
3080 If specified, a Unicode object to use as the boundary. Otherwise
3081 a random boundary is generated.
3082
3083 Reference: https://tools.ietf.org/html/rfc7578
3084 '''
3085 has_specified_boundary = boundary is not None
3086
3087 while True:
3088 if boundary is None:
3089 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3090
3091 try:
3092 out, content_type = _multipart_encode_impl(data, boundary)
3093 break
3094 except ValueError:
3095 if has_specified_boundary:
3096 raise
3097 boundary = None
3098
3099 return out, content_type
3100
3101
3102 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3103 for val in map(d.get, variadic(key_or_keys)):
3104 if val is not None and (val or not skip_false_values):
3105 return val
3106 return default
3107
3108
3109 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3110 for f in funcs:
3111 try:
3112 val = f(*args, **kwargs)
3113 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3114 pass
3115 else:
3116 if expected_type is None or isinstance(val, expected_type):
3117 return val
3118
3119
3120 def try_get(src, getter, expected_type=None):
3121 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3122
3123
3124 def filter_dict(dct, cndn=lambda _, v: v is not None):
3125 return {k: v for k, v in dct.items() if cndn(k, v)}
3126
3127
3128 def merge_dicts(*dicts):
3129 merged = {}
3130 for a_dict in dicts:
3131 for k, v in a_dict.items():
3132 if (v is not None and k not in merged
3133 or isinstance(v, str) and merged[k] == ''):
3134 merged[k] = v
3135 return merged
3136
3137
3138 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3139 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3140
3141
3142 US_RATINGS = {
3143 'G': 0,
3144 'PG': 10,
3145 'PG-13': 13,
3146 'R': 16,
3147 'NC': 18,
3148 }
3149
3150
3151 TV_PARENTAL_GUIDELINES = {
3152 'TV-Y': 0,
3153 'TV-Y7': 7,
3154 'TV-G': 0,
3155 'TV-PG': 0,
3156 'TV-14': 14,
3157 'TV-MA': 17,
3158 }
3159
3160
3161 def parse_age_limit(s):
3162 # isinstance(False, int) is True. So type() must be used instead
3163 if type(s) is int: # noqa: E721
3164 return s if 0 <= s <= 21 else None
3165 elif not isinstance(s, str):
3166 return None
3167 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3168 if m:
3169 return int(m.group('age'))
3170 s = s.upper()
3171 if s in US_RATINGS:
3172 return US_RATINGS[s]
3173 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3174 if m:
3175 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3176 return None
3177
3178
3179 def strip_jsonp(code):
3180 return re.sub(
3181 r'''(?sx)^
3182 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3183 (?:\s*&&\s*(?P=func_name))?
3184 \s*\(\s*(?P<callback_data>.*)\);?
3185 \s*?(?://[^\n]*)*$''',
3186 r'\g<callback_data>', code)
3187
3188
3189 def js_to_json(code, vars={}):
3190 # vars is a dict of var, val pairs to substitute
3191 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3192 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3193 INTEGER_TABLE = (
3194 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3195 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3196 )
3197
3198 def fix_kv(m):
3199 v = m.group(0)
3200 if v in ('true', 'false', 'null'):
3201 return v
3202 elif v in ('undefined', 'void 0'):
3203 return 'null'
3204 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3205 return ""
3206
3207 if v[0] in ("'", '"'):
3208 v = re.sub(r'(?s)\\.|"', lambda m: {
3209 '"': '\\"',
3210 "\\'": "'",
3211 '\\\n': '',
3212 '\\x': '\\u00',
3213 }.get(m.group(0), m.group(0)), v[1:-1])
3214 else:
3215 for regex, base in INTEGER_TABLE:
3216 im = re.match(regex, v)
3217 if im:
3218 i = int(im.group(1), base)
3219 return '"%d":' % i if v.endswith(':') else '%d' % i
3220
3221 if v in vars:
3222 return vars[v]
3223
3224 return '"%s"' % v
3225
3226 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3227
3228 return re.sub(r'''(?sx)
3229 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3230 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3231 {comment}|,(?={skip}[\]}}])|
3232 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3233 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3234 [0-9]+(?={skip}:)|
3235 !+
3236 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
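
# Example (illustrative): bare keys are quoted and hex literals are converted
# to decimal
#   js_to_json("{abc: 'def', xyz: 0x10}")  # -> '{"abc": "def", "xyz": 16}'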
3237
3238
3239 def qualities(quality_ids):
3240 """ Get a numeric quality value out of a list of possible values """
3241 def q(qid):
3242 try:
3243 return quality_ids.index(qid)
3244 except ValueError:
3245 return -1
3246 return q
3247
3248
3249 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
3250
3251
3252 DEFAULT_OUTTMPL = {
3253 'default': '%(title)s [%(id)s].%(ext)s',
3254 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3255 }
3256 OUTTMPL_TYPES = {
3257 'chapter': None,
3258 'subtitle': None,
3259 'thumbnail': None,
3260 'description': 'description',
3261 'annotation': 'annotations.xml',
3262 'infojson': 'info.json',
3263 'link': None,
3264 'pl_video': None,
3265 'pl_thumbnail': None,
3266 'pl_description': 'description',
3267 'pl_infojson': 'info.json',
3268 }
3269
3270 # As of [1] format syntax is:
3271 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3272 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3273 STR_FORMAT_RE_TMPL = r'''(?x)
3274 (?<!%)(?P<prefix>(?:%%)*)
3275 %
3276 (?P<has_key>\((?P<key>{0})\))?
3277 (?P<format>
3278 (?P<conversion>[#0\-+ ]+)?
3279 (?P<min_width>\d+)?
3280 (?P<precision>\.\d+)?
3281 (?P<len_mod>[hlL])? # unused in python
3282 {1} # conversion type
3283 )
3284 '''
3285
3286
3287 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3288
3289
3290 def limit_length(s, length):
3291 """ Add ellipses to overly long strings """
3292 if s is None:
3293 return None
3294 ELLIPSES = '...'
3295 if len(s) > length:
3296 return s[:length - len(ELLIPSES)] + ELLIPSES
3297 return s
3298
3299
3300 def version_tuple(v):
3301 return tuple(int(e) for e in re.split(r'[-.]', v))
3302
3303
3304 def is_outdated_version(version, limit, assume_new=True):
3305 if not version:
3306 return not assume_new
3307 try:
3308 return version_tuple(version) < version_tuple(limit)
3309 except ValueError:
3310 return not assume_new
3311
3312
3313 def ytdl_is_updateable():
3314 """ Returns if yt-dlp can be updated with -U """
3315
3316 from .update import is_non_updateable
3317
3318 return not is_non_updateable()
3319
3320
3321 def args_to_str(args):
3322 # Get a short string representation for a subprocess command
3323 return ' '.join(compat_shlex_quote(a) for a in args)
3324
3325
3326 def error_to_compat_str(err):
3327 return str(err)
3328
3329
3330 def error_to_str(err):
3331 return f'{type(err).__name__}: {err}'
3332
3333
3334 def mimetype2ext(mt):
3335 if mt is None:
3336 return None
3337
3338 mt, _, params = mt.partition(';')
3339 mt = mt.strip()
3340
3341 FULL_MAP = {
3342 'audio/mp4': 'm4a',
3343 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3
3344 # here since it is the most popular one
3345 'audio/mpeg': 'mp3',
3346 'audio/x-wav': 'wav',
3347 'audio/wav': 'wav',
3348 'audio/wave': 'wav',
3349 }
3350
3351 ext = FULL_MAP.get(mt)
3352 if ext is not None:
3353 return ext
3354
3355 SUBTYPE_MAP = {
3356 '3gpp': '3gp',
3357 'smptett+xml': 'tt',
3358 'ttaf+xml': 'dfxp',
3359 'ttml+xml': 'ttml',
3360 'x-flv': 'flv',
3361 'x-mp4-fragmented': 'mp4',
3362 'x-ms-sami': 'sami',
3363 'x-ms-wmv': 'wmv',
3364 'mpegurl': 'm3u8',
3365 'x-mpegurl': 'm3u8',
3366 'vnd.apple.mpegurl': 'm3u8',
3367 'dash+xml': 'mpd',
3368 'f4m+xml': 'f4m',
3369 'hds+xml': 'f4m',
3370 'vnd.ms-sstr+xml': 'ism',
3371 'quicktime': 'mov',
3372 'mp2t': 'ts',
3373 'x-wav': 'wav',
3374 'filmstrip+json': 'fs',
3375 'svg+xml': 'svg',
3376 }
3377
3378 _, _, subtype = mt.rpartition('/')
3379 ext = SUBTYPE_MAP.get(subtype.lower())
3380 if ext is not None:
3381 return ext
3382
3383 SUFFIX_MAP = {
3384 'json': 'json',
3385 'xml': 'xml',
3386 'zip': 'zip',
3387 'gzip': 'gz',
3388 }
3389
3390 _, _, suffix = subtype.partition('+')
3391 ext = SUFFIX_MAP.get(suffix)
3392 if ext is not None:
3393 return ext
3394
3395 return subtype.replace('+', '.')
3396
3397
3398 def ext2mimetype(ext_or_url):
3399 if not ext_or_url:
3400 return None
3401 if '.' not in ext_or_url:
3402 ext_or_url = f'file.{ext_or_url}'
3403 return mimetypes.guess_type(ext_or_url)[0]
3404
3405
3406 def parse_codecs(codecs_str):
3407 # http://tools.ietf.org/html/rfc6381
3408 if not codecs_str:
3409 return {}
3410 split_codecs = list(filter(None, map(
3411 str.strip, codecs_str.strip().strip(',').split(','))))
3412 vcodec, acodec, scodec, hdr = None, None, None, None
3413 for full_codec in split_codecs:
3414 parts = full_codec.split('.')
3415 codec = parts[0].replace('0', '')
3416 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3417 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3418 if not vcodec:
3419 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3420 if codec in ('dvh1', 'dvhe'):
3421 hdr = 'DV'
3422 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3423 hdr = 'HDR10'
3424 elif full_codec.replace('0', '').startswith('vp9.2'):
3425 hdr = 'HDR10'
3426 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3427 if not acodec:
3428 acodec = full_codec
3429 elif codec in ('stpp', 'wvtt',):
3430 if not scodec:
3431 scodec = full_codec
3432 else:
3433 write_string(f'WARNING: Unknown codec {full_codec}\n')
3434 if vcodec or acodec or scodec:
3435 return {
3436 'vcodec': vcodec or 'none',
3437 'acodec': acodec or 'none',
3438 'dynamic_range': hdr,
3439 **({'scodec': scodec} if scodec is not None else {}),
3440 }
3441 elif len(split_codecs) == 2:
3442 return {
3443 'vcodec': split_codecs[0],
3444 'acodec': split_codecs[1],
3445 }
3446 return {}
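
# Example (illustrative):
#   parse_codecs('avc1.64001f, mp4a.40.2')
#   -> {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}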
3447
3448
3449 def urlhandle_detect_ext(url_handle):
3450 getheader = url_handle.headers.get
3451
3452 cd = getheader('Content-Disposition')
3453 if cd:
3454 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3455 if m:
3456 e = determine_ext(m.group('filename'), default_ext=None)
3457 if e:
3458 return e
3459
3460 return mimetype2ext(getheader('Content-Type'))
3461
3462
3463 def encode_data_uri(data, mime_type):
3464 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3465
3466
3467 def age_restricted(content_limit, age_limit):
3468 """ Returns True iff the content should be blocked """
3469
3470 if age_limit is None: # No limit set
3471 return False
3472 if content_limit is None:
3473 return False # Content available for everyone
3474 return age_limit < content_limit
3475
3476
3477 def is_html(first_bytes):
3478 """ Detect whether a file contains HTML by examining its first bytes. """
3479
3480 BOMS = [
3481 (b'\xef\xbb\xbf', 'utf-8'),
3482 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3483 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3484 (b'\xff\xfe', 'utf-16-le'),
3485 (b'\xfe\xff', 'utf-16-be'),
3486 ]
3487
3488 encoding = 'utf-8'
3489 for bom, enc in BOMS:
3490 while first_bytes.startswith(bom):
3491 encoding, first_bytes = enc, first_bytes[len(bom):]
3492
3493 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3494
3495
3496 def determine_protocol(info_dict):
3497 protocol = info_dict.get('protocol')
3498 if protocol is not None:
3499 return protocol
3500
3501 url = sanitize_url(info_dict['url'])
3502 if url.startswith('rtmp'):
3503 return 'rtmp'
3504 elif url.startswith('mms'):
3505 return 'mms'
3506 elif url.startswith('rtsp'):
3507 return 'rtsp'
3508
3509 ext = determine_ext(url)
3510 if ext == 'm3u8':
3511 return 'm3u8'
3512 elif ext == 'f4m':
3513 return 'f4m'
3514
3515 return compat_urllib_parse_urlparse(url).scheme
3516
3517
3518 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3519 """ Render a list of rows, each as a list of values.
3520 Text after a \t will be right aligned """
3521 def width(string):
3522 return len(remove_terminal_sequences(string).replace('\t', ''))
3523
3524 def get_max_lens(table):
3525 return [max(width(str(v)) for v in col) for col in zip(*table)]
3526
3527 def filter_using_list(row, filterArray):
3528 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3529
3530 max_lens = get_max_lens(data) if hide_empty else []
3531 header_row = filter_using_list(header_row, max_lens)
3532 data = [filter_using_list(row, max_lens) for row in data]
3533
3534 table = [header_row] + data
3535 max_lens = get_max_lens(table)
3536 extra_gap += 1
3537 if delim:
3538 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3539 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3540 for row in table:
3541 for pos, text in enumerate(map(str, row)):
3542 if '\t' in text:
3543 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3544 else:
3545 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3546 ret = '\n'.join(''.join(row).rstrip() for row in table)
3547 return ret
3548
3549
3550 def _match_one(filter_part, dct, incomplete):
3551 # TODO: Generalize code with YoutubeDL._build_format_filter
3552 STRING_OPERATORS = {
3553 '*=': operator.contains,
3554 '^=': lambda attr, value: attr.startswith(value),
3555 '$=': lambda attr, value: attr.endswith(value),
3556 '~=': lambda attr, value: re.search(value, attr),
3557 }
3558 COMPARISON_OPERATORS = {
3559 **STRING_OPERATORS,
3560 '<=': operator.le, # "<=" must be defined above "<"
3561 '<': operator.lt,
3562 '>=': operator.ge,
3563 '>': operator.gt,
3564 '=': operator.eq,
3565 }
3566
3567 if isinstance(incomplete, bool):
3568 is_incomplete = lambda _: incomplete
3569 else:
3570 is_incomplete = lambda k: k in incomplete
3571
3572 operator_rex = re.compile(r'''(?x)
3573 (?P<key>[a-z_]+)
3574 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3575 (?:
3576 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3577 (?P<strval>.+?)
3578 )
3579 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3580 m = operator_rex.fullmatch(filter_part.strip())
3581 if m:
3582 m = m.groupdict()
3583 unnegated_op = COMPARISON_OPERATORS[m['op']]
3584 if m['negation']:
3585 op = lambda attr, value: not unnegated_op(attr, value)
3586 else:
3587 op = unnegated_op
3588 comparison_value = m['quotedstrval'] or m['strval']  # the regex defines no 'intval' group
3589 if m['quote']:
3590 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3591 actual_value = dct.get(m['key'])
3592 numeric_comparison = None
3593 if isinstance(actual_value, (int, float)):
3594 # If the original field is numeric, try to interpret the comparison
3595 # value as a number too (including filesize/duration suffixes). If the
3596 # original field is a string, the comparison value is kept as a string
3597 # (see https://github.com/ytdl-org/youtube-dl/issues/11082)
3598 try:
3599 numeric_comparison = int(comparison_value)
3600 except ValueError:
3601 numeric_comparison = parse_filesize(comparison_value)
3602 if numeric_comparison is None:
3603 numeric_comparison = parse_filesize(f'{comparison_value}B')
3604 if numeric_comparison is None:
3605 numeric_comparison = parse_duration(comparison_value)
3606 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3607 raise ValueError('Operator %s only supports string values!' % m['op'])
3608 if actual_value is None:
3609 return is_incomplete(m['key']) or m['none_inclusive']
3610 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3611
3612 UNARY_OPERATORS = {
3613 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3614 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3615 }
3616 operator_rex = re.compile(r'''(?x)
3617 (?P<op>%s)\s*(?P<key>[a-z_]+)
3618 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3619 m = operator_rex.fullmatch(filter_part.strip())
3620 if m:
3621 op = UNARY_OPERATORS[m.group('op')]
3622 actual_value = dct.get(m.group('key'))
3623 if is_incomplete(m.group('key')) and actual_value is None:
3624 return True
3625 return op(actual_value)
3626
3627 raise ValueError('Invalid filter part %r' % filter_part)
3628
3629
3630 def match_str(filter_str, dct, incomplete=False):
3631 """ Filter a dictionary with a simple string syntax.
3632 @returns Whether the filter passes
3633 @param incomplete Set of keys that are expected to be missing from dct.
3634 Can be True/False to indicate that all/none of the keys may be missing.
3635 All conditions on incomplete keys pass if the key is missing.
3636 """
3637 return all(
3638 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3639 for filter_part in re.split(r'(?<!\\)&', filter_str))
3640
3641
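# A small, self-contained illustration of the filter syntax accepted by
# match_str (illustrative values only; nothing in the module calls this):
def _match_str_examples():
    assert match_str('duration > 60 & title ~= dog', {'duration': 90, 'title': 'dog video'})
    assert match_str('!is_live', {'is_live': False})
    # a '?' after the operator lets the condition pass when the key is missing
    assert match_str('like_count >? 100', {})

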
3642 def match_filter_func(filters):
3643 if not filters:
3644 return None
3645 filters = set(variadic(filters))
3646
3647 interactive = '-' in filters
3648 if interactive:
3649 filters.remove('-')
3650
3651 def _match_func(info_dict, incomplete=False):
3652 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3653 return NO_DEFAULT if interactive and not incomplete else None
3654 else:
3655 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3656 filter_str = ') | ('.join(map(str.strip, filters))
3657 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3658 return _match_func
3659
3660
3661 def download_range_func(chapters, ranges):
3662 def inner(info_dict, ydl):
3663 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3664 else 'Cannot match chapters since chapter information is unavailable')
3665 for regex in chapters or []:
3666 for i, chapter in enumerate(info_dict.get('chapters') or []):
3667 if re.search(regex, chapter['title']):
3668 warning = None
3669 yield {**chapter, 'index': i}
3670 if chapters and warning:
3671 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3672
3673 yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
3674
3675 return inner
3676
3677
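# A small sketch of how download_range_func behaves (illustrative values;
# with no chapter regexes, the `ydl` argument is never touched):
#
#   ranges = download_range_func(None, [(10, 20)])
#   list(ranges({}, None))
#   # ==> [{'start_time': 10, 'end_time': 20}]
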
3678 def parse_dfxp_time_expr(time_expr):
3679 if not time_expr:
3680 return
3681
3682 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3683 if mobj:
3684 return float(mobj.group('time_offset'))
3685
3686 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3687 if mobj:
3688 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3689
3690
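# Usage sketch for parse_dfxp_time_expr (illustrative values):
#
#   parse_dfxp_time_expr('5.2s')        # ==> 5.2
#   parse_dfxp_time_expr('00:01:30.5')  # ==> 90.5
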
3691 def srt_subtitles_timecode(seconds):
3692 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3693
3694
3695 def ass_subtitles_timecode(seconds):
3696 time = timetuple_from_msec(seconds * 1000)
3697 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3698
3699
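# Usage sketch for the subtitle timecode helpers (illustrative values,
# assuming timetuple_from_msec, defined earlier in this file, splits into
# (hours, minutes, seconds, milliseconds)):
#
#   srt_subtitles_timecode(3661.5)  # ==> '01:01:01,500'
#   ass_subtitles_timecode(3661.5)  # ==> '1:01:01.50'
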
3700 def dfxp2srt(dfxp_data):
3701 '''
3702 @param dfxp_data A bytes-like object containing DFXP data
3703 @returns A unicode object containing converted SRT data
3704 '''
3705 LEGACY_NAMESPACES = (
3706 (b'http://www.w3.org/ns/ttml', [
3707 b'http://www.w3.org/2004/11/ttaf1',
3708 b'http://www.w3.org/2006/04/ttaf1',
3709 b'http://www.w3.org/2006/10/ttaf1',
3710 ]),
3711 (b'http://www.w3.org/ns/ttml#styling', [
3712 b'http://www.w3.org/ns/ttml#style',
3713 ]),
3714 )
3715
3716 SUPPORTED_STYLING = [
3717 'color',
3718 'fontFamily',
3719 'fontSize',
3720 'fontStyle',
3721 'fontWeight',
3722 'textDecoration'
3723 ]
3724
3725 _x = functools.partial(xpath_with_ns, ns_map={
3726 'xml': 'http://www.w3.org/XML/1998/namespace',
3727 'ttml': 'http://www.w3.org/ns/ttml',
3728 'tts': 'http://www.w3.org/ns/ttml#styling',
3729 })
3730
3731 styles = {}
3732 default_style = {}
3733
3734 class TTMLPElementParser:
3735 _out = ''
3736 _unclosed_elements = []
3737 _applied_styles = []
3738
3739 def start(self, tag, attrib):
3740 if tag in (_x('ttml:br'), 'br'):
3741 self._out += '\n'
3742 else:
3743 unclosed_elements = []
3744 style = {}
3745 element_style_id = attrib.get('style')
3746 if default_style:
3747 style.update(default_style)
3748 if element_style_id:
3749 style.update(styles.get(element_style_id, {}))
3750 for prop in SUPPORTED_STYLING:
3751 prop_val = attrib.get(_x('tts:' + prop))
3752 if prop_val:
3753 style[prop] = prop_val
3754 if style:
3755 font = ''
3756 for k, v in sorted(style.items()):
3757 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3758 continue
3759 if k == 'color':
3760 font += ' color="%s"' % v
3761 elif k == 'fontSize':
3762 font += ' size="%s"' % v
3763 elif k == 'fontFamily':
3764 font += ' face="%s"' % v
3765 elif k == 'fontWeight' and v == 'bold':
3766 self._out += '<b>'
3767 unclosed_elements.append('b')
3768 elif k == 'fontStyle' and v == 'italic':
3769 self._out += '<i>'
3770 unclosed_elements.append('i')
3771 elif k == 'textDecoration' and v == 'underline':
3772 self._out += '<u>'
3773 unclosed_elements.append('u')
3774 if font:
3775 self._out += '<font' + font + '>'
3776 unclosed_elements.append('font')
3777 applied_style = {}
3778 if self._applied_styles:
3779 applied_style.update(self._applied_styles[-1])
3780 applied_style.update(style)
3781 self._applied_styles.append(applied_style)
3782 self._unclosed_elements.append(unclosed_elements)
3783
3784 def end(self, tag):
3785 if tag not in (_x('ttml:br'), 'br'):
3786 unclosed_elements = self._unclosed_elements.pop()
3787 for element in reversed(unclosed_elements):
3788 self._out += '</%s>' % element
3789 if unclosed_elements and self._applied_styles:
3790 self._applied_styles.pop()
3791
3792 def data(self, data):
3793 self._out += data
3794
3795 def close(self):
3796 return self._out.strip()
3797
3798 def parse_node(node):
3799 target = TTMLPElementParser()
3800 parser = xml.etree.ElementTree.XMLParser(target=target)
3801 parser.feed(xml.etree.ElementTree.tostring(node))
3802 return parser.close()
3803
3804 for k, v in LEGACY_NAMESPACES:
3805 for ns in v:
3806 dfxp_data = dfxp_data.replace(ns, k)
3807
3808 dfxp = compat_etree_fromstring(dfxp_data)
3809 out = []
3810 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3811
3812 if not paras:
3813 raise ValueError('Invalid dfxp/TTML subtitle')
3814
3815 repeat = False
3816 while True:
3817 for style in dfxp.findall(_x('.//ttml:style')):
3818 style_id = style.get('id') or style.get(_x('xml:id'))
3819 if not style_id:
3820 continue
3821 parent_style_id = style.get('style')
3822 if parent_style_id:
3823 if parent_style_id not in styles:
3824 repeat = True
3825 continue
3826 styles[style_id] = styles[parent_style_id].copy()
3827 for prop in SUPPORTED_STYLING:
3828 prop_val = style.get(_x('tts:' + prop))
3829 if prop_val:
3830 styles.setdefault(style_id, {})[prop] = prop_val
3831 if repeat:
3832 repeat = False
3833 else:
3834 break
3835
3836 for p in ('body', 'div'):
3837 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3838 if ele is None:
3839 continue
3840 style = styles.get(ele.get('style'))
3841 if not style:
3842 continue
3843 default_style.update(style)
3844
3845 for para, index in zip(paras, itertools.count(1)):
3846 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3847 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3848 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3849 if begin_time is None:
3850 continue
3851 if not end_time:
3852 if not dur:
3853 continue
3854 end_time = begin_time + dur
3855 out.append('%d\n%s --> %s\n%s\n\n' % (
3856 index,
3857 srt_subtitles_timecode(begin_time),
3858 srt_subtitles_timecode(end_time),
3859 parse_node(para)))
3860
3861 return ''.join(out)
3862
3863
3864 def cli_option(params, command_option, param, separator=None):
3865 param = params.get(param)
3866 return ([] if param is None
3867 else [command_option, str(param)] if separator is None
3868 else [f'{command_option}{separator}{param}'])
3869
3870
3871 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3872 param = params.get(param)
3873 assert param in (True, False, None)
3874 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3875
3876
3877 def cli_valueless_option(params, command_option, param, expected_value=True):
3878 return [command_option] if params.get(param) == expected_value else []
3879
3880
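# Usage sketch for the cli_* helpers (illustrative values):
#
#   cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
#   # ==> ['--proxy', '127.0.0.1:3128']
#   cli_bool_option({'check': True}, '--check', 'check')
#   # ==> ['--check', 'true']
#   cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#   # ==> ['--quiet']
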
3881 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3882 if isinstance(argdict, (list, tuple)): # for backward compatibility
3883 if use_compat:
3884 return argdict
3885 else:
3886 argdict = None
3887 if argdict is None:
3888 return default
3889 assert isinstance(argdict, dict)
3890
3891 assert isinstance(keys, (list, tuple))
3892 for key_list in keys:
3893 arg_list = list(filter(
3894 lambda x: x is not None,
3895 [argdict.get(key.lower()) for key in variadic(key_list)]))
3896 if arg_list:
3897 return [arg for args in arg_list for arg in args]
3898 return default
3899
3900
3901 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3902 main_key, exe = main_key.lower(), exe.lower()
3903 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3904 keys = [f'{root_key}{k}' for k in (keys or [''])]
3905 if root_key in keys:
3906 if main_key != exe:
3907 keys.append((main_key, exe))
3908 keys.append('default')
3909 else:
3910 use_compat = False
3911 return cli_configuration_args(argdict, keys, default, use_compat)
3912
3913
3914 class ISO639Utils:
3915 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3916 _lang_map = {
3917 'aa': 'aar',
3918 'ab': 'abk',
3919 'ae': 'ave',
3920 'af': 'afr',
3921 'ak': 'aka',
3922 'am': 'amh',
3923 'an': 'arg',
3924 'ar': 'ara',
3925 'as': 'asm',
3926 'av': 'ava',
3927 'ay': 'aym',
3928 'az': 'aze',
3929 'ba': 'bak',
3930 'be': 'bel',
3931 'bg': 'bul',
3932 'bh': 'bih',
3933 'bi': 'bis',
3934 'bm': 'bam',
3935 'bn': 'ben',
3936 'bo': 'bod',
3937 'br': 'bre',
3938 'bs': 'bos',
3939 'ca': 'cat',
3940 'ce': 'che',
3941 'ch': 'cha',
3942 'co': 'cos',
3943 'cr': 'cre',
3944 'cs': 'ces',
3945 'cu': 'chu',
3946 'cv': 'chv',
3947 'cy': 'cym',
3948 'da': 'dan',
3949 'de': 'deu',
3950 'dv': 'div',
3951 'dz': 'dzo',
3952 'ee': 'ewe',
3953 'el': 'ell',
3954 'en': 'eng',
3955 'eo': 'epo',
3956 'es': 'spa',
3957 'et': 'est',
3958 'eu': 'eus',
3959 'fa': 'fas',
3960 'ff': 'ful',
3961 'fi': 'fin',
3962 'fj': 'fij',
3963 'fo': 'fao',
3964 'fr': 'fra',
3965 'fy': 'fry',
3966 'ga': 'gle',
3967 'gd': 'gla',
3968 'gl': 'glg',
3969 'gn': 'grn',
3970 'gu': 'guj',
3971 'gv': 'glv',
3972 'ha': 'hau',
3973 'he': 'heb',
3974 'iw': 'heb', # Replaced by he in 1989 revision
3975 'hi': 'hin',
3976 'ho': 'hmo',
3977 'hr': 'hrv',
3978 'ht': 'hat',
3979 'hu': 'hun',
3980 'hy': 'hye',
3981 'hz': 'her',
3982 'ia': 'ina',
3983 'id': 'ind',
3984 'in': 'ind', # Replaced by id in 1989 revision
3985 'ie': 'ile',
3986 'ig': 'ibo',
3987 'ii': 'iii',
3988 'ik': 'ipk',
3989 'io': 'ido',
3990 'is': 'isl',
3991 'it': 'ita',
3992 'iu': 'iku',
3993 'ja': 'jpn',
3994 'jv': 'jav',
3995 'ka': 'kat',
3996 'kg': 'kon',
3997 'ki': 'kik',
3998 'kj': 'kua',
3999 'kk': 'kaz',
4000 'kl': 'kal',
4001 'km': 'khm',
4002 'kn': 'kan',
4003 'ko': 'kor',
4004 'kr': 'kau',
4005 'ks': 'kas',
4006 'ku': 'kur',
4007 'kv': 'kom',
4008 'kw': 'cor',
4009 'ky': 'kir',
4010 'la': 'lat',
4011 'lb': 'ltz',
4012 'lg': 'lug',
4013 'li': 'lim',
4014 'ln': 'lin',
4015 'lo': 'lao',
4016 'lt': 'lit',
4017 'lu': 'lub',
4018 'lv': 'lav',
4019 'mg': 'mlg',
4020 'mh': 'mah',
4021 'mi': 'mri',
4022 'mk': 'mkd',
4023 'ml': 'mal',
4024 'mn': 'mon',
4025 'mr': 'mar',
4026 'ms': 'msa',
4027 'mt': 'mlt',
4028 'my': 'mya',
4029 'na': 'nau',
4030 'nb': 'nob',
4031 'nd': 'nde',
4032 'ne': 'nep',
4033 'ng': 'ndo',
4034 'nl': 'nld',
4035 'nn': 'nno',
4036 'no': 'nor',
4037 'nr': 'nbl',
4038 'nv': 'nav',
4039 'ny': 'nya',
4040 'oc': 'oci',
4041 'oj': 'oji',
4042 'om': 'orm',
4043 'or': 'ori',
4044 'os': 'oss',
4045 'pa': 'pan',
4046 'pi': 'pli',
4047 'pl': 'pol',
4048 'ps': 'pus',
4049 'pt': 'por',
4050 'qu': 'que',
4051 'rm': 'roh',
4052 'rn': 'run',
4053 'ro': 'ron',
4054 'ru': 'rus',
4055 'rw': 'kin',
4056 'sa': 'san',
4057 'sc': 'srd',
4058 'sd': 'snd',
4059 'se': 'sme',
4060 'sg': 'sag',
4061 'si': 'sin',
4062 'sk': 'slk',
4063 'sl': 'slv',
4064 'sm': 'smo',
4065 'sn': 'sna',
4066 'so': 'som',
4067 'sq': 'sqi',
4068 'sr': 'srp',
4069 'ss': 'ssw',
4070 'st': 'sot',
4071 'su': 'sun',
4072 'sv': 'swe',
4073 'sw': 'swa',
4074 'ta': 'tam',
4075 'te': 'tel',
4076 'tg': 'tgk',
4077 'th': 'tha',
4078 'ti': 'tir',
4079 'tk': 'tuk',
4080 'tl': 'tgl',
4081 'tn': 'tsn',
4082 'to': 'ton',
4083 'tr': 'tur',
4084 'ts': 'tso',
4085 'tt': 'tat',
4086 'tw': 'twi',
4087 'ty': 'tah',
4088 'ug': 'uig',
4089 'uk': 'ukr',
4090 'ur': 'urd',
4091 'uz': 'uzb',
4092 've': 'ven',
4093 'vi': 'vie',
4094 'vo': 'vol',
4095 'wa': 'wln',
4096 'wo': 'wol',
4097 'xh': 'xho',
4098 'yi': 'yid',
4099 'ji': 'yid', # Replaced by yi in 1989 revision
4100 'yo': 'yor',
4101 'za': 'zha',
4102 'zh': 'zho',
4103 'zu': 'zul',
4104 }
4105
4106 @classmethod
4107 def short2long(cls, code):
4108 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4109 return cls._lang_map.get(code[:2])
4110
4111 @classmethod
4112 def long2short(cls, code):
4113 """Convert language code from ISO 639-2/T to ISO 639-1"""
4114 for short_name, long_name in cls._lang_map.items():
4115 if long_name == code:
4116 return short_name
4117
4118
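# Usage sketch for ISO639Utils (illustrative values):
#
#   ISO639Utils.short2long('en')     # ==> 'eng'
#   ISO639Utils.short2long('en-US')  # only the first two characters are used ==> 'eng'
#   ISO639Utils.long2short('deu')    # ==> 'de'
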
4119 class ISO3166Utils:
4120 # From http://data.okfn.org/data/core/country-list
4121 _country_map = {
4122 'AF': 'Afghanistan',
4123 'AX': 'Åland Islands',
4124 'AL': 'Albania',
4125 'DZ': 'Algeria',
4126 'AS': 'American Samoa',
4127 'AD': 'Andorra',
4128 'AO': 'Angola',
4129 'AI': 'Anguilla',
4130 'AQ': 'Antarctica',
4131 'AG': 'Antigua and Barbuda',
4132 'AR': 'Argentina',
4133 'AM': 'Armenia',
4134 'AW': 'Aruba',
4135 'AU': 'Australia',
4136 'AT': 'Austria',
4137 'AZ': 'Azerbaijan',
4138 'BS': 'Bahamas',
4139 'BH': 'Bahrain',
4140 'BD': 'Bangladesh',
4141 'BB': 'Barbados',
4142 'BY': 'Belarus',
4143 'BE': 'Belgium',
4144 'BZ': 'Belize',
4145 'BJ': 'Benin',
4146 'BM': 'Bermuda',
4147 'BT': 'Bhutan',
4148 'BO': 'Bolivia, Plurinational State of',
4149 'BQ': 'Bonaire, Sint Eustatius and Saba',
4150 'BA': 'Bosnia and Herzegovina',
4151 'BW': 'Botswana',
4152 'BV': 'Bouvet Island',
4153 'BR': 'Brazil',
4154 'IO': 'British Indian Ocean Territory',
4155 'BN': 'Brunei Darussalam',
4156 'BG': 'Bulgaria',
4157 'BF': 'Burkina Faso',
4158 'BI': 'Burundi',
4159 'KH': 'Cambodia',
4160 'CM': 'Cameroon',
4161 'CA': 'Canada',
4162 'CV': 'Cape Verde',
4163 'KY': 'Cayman Islands',
4164 'CF': 'Central African Republic',
4165 'TD': 'Chad',
4166 'CL': 'Chile',
4167 'CN': 'China',
4168 'CX': 'Christmas Island',
4169 'CC': 'Cocos (Keeling) Islands',
4170 'CO': 'Colombia',
4171 'KM': 'Comoros',
4172 'CG': 'Congo',
4173 'CD': 'Congo, the Democratic Republic of the',
4174 'CK': 'Cook Islands',
4175 'CR': 'Costa Rica',
4176 'CI': 'Côte d\'Ivoire',
4177 'HR': 'Croatia',
4178 'CU': 'Cuba',
4179 'CW': 'Curaçao',
4180 'CY': 'Cyprus',
4181 'CZ': 'Czech Republic',
4182 'DK': 'Denmark',
4183 'DJ': 'Djibouti',
4184 'DM': 'Dominica',
4185 'DO': 'Dominican Republic',
4186 'EC': 'Ecuador',
4187 'EG': 'Egypt',
4188 'SV': 'El Salvador',
4189 'GQ': 'Equatorial Guinea',
4190 'ER': 'Eritrea',
4191 'EE': 'Estonia',
4192 'ET': 'Ethiopia',
4193 'FK': 'Falkland Islands (Malvinas)',
4194 'FO': 'Faroe Islands',
4195 'FJ': 'Fiji',
4196 'FI': 'Finland',
4197 'FR': 'France',
4198 'GF': 'French Guiana',
4199 'PF': 'French Polynesia',
4200 'TF': 'French Southern Territories',
4201 'GA': 'Gabon',
4202 'GM': 'Gambia',
4203 'GE': 'Georgia',
4204 'DE': 'Germany',
4205 'GH': 'Ghana',
4206 'GI': 'Gibraltar',
4207 'GR': 'Greece',
4208 'GL': 'Greenland',
4209 'GD': 'Grenada',
4210 'GP': 'Guadeloupe',
4211 'GU': 'Guam',
4212 'GT': 'Guatemala',
4213 'GG': 'Guernsey',
4214 'GN': 'Guinea',
4215 'GW': 'Guinea-Bissau',
4216 'GY': 'Guyana',
4217 'HT': 'Haiti',
4218 'HM': 'Heard Island and McDonald Islands',
4219 'VA': 'Holy See (Vatican City State)',
4220 'HN': 'Honduras',
4221 'HK': 'Hong Kong',
4222 'HU': 'Hungary',
4223 'IS': 'Iceland',
4224 'IN': 'India',
4225 'ID': 'Indonesia',
4226 'IR': 'Iran, Islamic Republic of',
4227 'IQ': 'Iraq',
4228 'IE': 'Ireland',
4229 'IM': 'Isle of Man',
4230 'IL': 'Israel',
4231 'IT': 'Italy',
4232 'JM': 'Jamaica',
4233 'JP': 'Japan',
4234 'JE': 'Jersey',
4235 'JO': 'Jordan',
4236 'KZ': 'Kazakhstan',
4237 'KE': 'Kenya',
4238 'KI': 'Kiribati',
4239 'KP': 'Korea, Democratic People\'s Republic of',
4240 'KR': 'Korea, Republic of',
4241 'KW': 'Kuwait',
4242 'KG': 'Kyrgyzstan',
4243 'LA': 'Lao People\'s Democratic Republic',
4244 'LV': 'Latvia',
4245 'LB': 'Lebanon',
4246 'LS': 'Lesotho',
4247 'LR': 'Liberia',
4248 'LY': 'Libya',
4249 'LI': 'Liechtenstein',
4250 'LT': 'Lithuania',
4251 'LU': 'Luxembourg',
4252 'MO': 'Macao',
4253 'MK': 'Macedonia, the Former Yugoslav Republic of',
4254 'MG': 'Madagascar',
4255 'MW': 'Malawi',
4256 'MY': 'Malaysia',
4257 'MV': 'Maldives',
4258 'ML': 'Mali',
4259 'MT': 'Malta',
4260 'MH': 'Marshall Islands',
4261 'MQ': 'Martinique',
4262 'MR': 'Mauritania',
4263 'MU': 'Mauritius',
4264 'YT': 'Mayotte',
4265 'MX': 'Mexico',
4266 'FM': 'Micronesia, Federated States of',
4267 'MD': 'Moldova, Republic of',
4268 'MC': 'Monaco',
4269 'MN': 'Mongolia',
4270 'ME': 'Montenegro',
4271 'MS': 'Montserrat',
4272 'MA': 'Morocco',
4273 'MZ': 'Mozambique',
4274 'MM': 'Myanmar',
4275 'NA': 'Namibia',
4276 'NR': 'Nauru',
4277 'NP': 'Nepal',
4278 'NL': 'Netherlands',
4279 'NC': 'New Caledonia',
4280 'NZ': 'New Zealand',
4281 'NI': 'Nicaragua',
4282 'NE': 'Niger',
4283 'NG': 'Nigeria',
4284 'NU': 'Niue',
4285 'NF': 'Norfolk Island',
4286 'MP': 'Northern Mariana Islands',
4287 'NO': 'Norway',
4288 'OM': 'Oman',
4289 'PK': 'Pakistan',
4290 'PW': 'Palau',
4291 'PS': 'Palestine, State of',
4292 'PA': 'Panama',
4293 'PG': 'Papua New Guinea',
4294 'PY': 'Paraguay',
4295 'PE': 'Peru',
4296 'PH': 'Philippines',
4297 'PN': 'Pitcairn',
4298 'PL': 'Poland',
4299 'PT': 'Portugal',
4300 'PR': 'Puerto Rico',
4301 'QA': 'Qatar',
4302 'RE': 'Réunion',
4303 'RO': 'Romania',
4304 'RU': 'Russian Federation',
4305 'RW': 'Rwanda',
4306 'BL': 'Saint Barthélemy',
4307 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4308 'KN': 'Saint Kitts and Nevis',
4309 'LC': 'Saint Lucia',
4310 'MF': 'Saint Martin (French part)',
4311 'PM': 'Saint Pierre and Miquelon',
4312 'VC': 'Saint Vincent and the Grenadines',
4313 'WS': 'Samoa',
4314 'SM': 'San Marino',
4315 'ST': 'Sao Tome and Principe',
4316 'SA': 'Saudi Arabia',
4317 'SN': 'Senegal',
4318 'RS': 'Serbia',
4319 'SC': 'Seychelles',
4320 'SL': 'Sierra Leone',
4321 'SG': 'Singapore',
4322 'SX': 'Sint Maarten (Dutch part)',
4323 'SK': 'Slovakia',
4324 'SI': 'Slovenia',
4325 'SB': 'Solomon Islands',
4326 'SO': 'Somalia',
4327 'ZA': 'South Africa',
4328 'GS': 'South Georgia and the South Sandwich Islands',
4329 'SS': 'South Sudan',
4330 'ES': 'Spain',
4331 'LK': 'Sri Lanka',
4332 'SD': 'Sudan',
4333 'SR': 'Suriname',
4334 'SJ': 'Svalbard and Jan Mayen',
4335 'SZ': 'Swaziland',
4336 'SE': 'Sweden',
4337 'CH': 'Switzerland',
4338 'SY': 'Syrian Arab Republic',
4339 'TW': 'Taiwan, Province of China',
4340 'TJ': 'Tajikistan',
4341 'TZ': 'Tanzania, United Republic of',
4342 'TH': 'Thailand',
4343 'TL': 'Timor-Leste',
4344 'TG': 'Togo',
4345 'TK': 'Tokelau',
4346 'TO': 'Tonga',
4347 'TT': 'Trinidad and Tobago',
4348 'TN': 'Tunisia',
4349 'TR': 'Turkey',
4350 'TM': 'Turkmenistan',
4351 'TC': 'Turks and Caicos Islands',
4352 'TV': 'Tuvalu',
4353 'UG': 'Uganda',
4354 'UA': 'Ukraine',
4355 'AE': 'United Arab Emirates',
4356 'GB': 'United Kingdom',
4357 'US': 'United States',
4358 'UM': 'United States Minor Outlying Islands',
4359 'UY': 'Uruguay',
4360 'UZ': 'Uzbekistan',
4361 'VU': 'Vanuatu',
4362 'VE': 'Venezuela, Bolivarian Republic of',
4363 'VN': 'Viet Nam',
4364 'VG': 'Virgin Islands, British',
4365 'VI': 'Virgin Islands, U.S.',
4366 'WF': 'Wallis and Futuna',
4367 'EH': 'Western Sahara',
4368 'YE': 'Yemen',
4369 'ZM': 'Zambia',
4370 'ZW': 'Zimbabwe',
4371 # Not ISO 3166 codes, but used for IP blocks
4372 'AP': 'Asia/Pacific Region',
4373 'EU': 'Europe',
4374 }
4375
4376 @classmethod
4377 def short2full(cls, code):
4378 """Convert an ISO 3166-2 country code to the corresponding full name"""
4379 return cls._country_map.get(code.upper())
4380
4381
4382 class GeoUtils:
4383 # Major IPv4 address blocks per country
4384 _country_ip_map = {
4385 'AD': '46.172.224.0/19',
4386 'AE': '94.200.0.0/13',
4387 'AF': '149.54.0.0/17',
4388 'AG': '209.59.64.0/18',
4389 'AI': '204.14.248.0/21',
4390 'AL': '46.99.0.0/16',
4391 'AM': '46.70.0.0/15',
4392 'AO': '105.168.0.0/13',
4393 'AP': '182.50.184.0/21',
4394 'AQ': '23.154.160.0/24',
4395 'AR': '181.0.0.0/12',
4396 'AS': '202.70.112.0/20',
4397 'AT': '77.116.0.0/14',
4398 'AU': '1.128.0.0/11',
4399 'AW': '181.41.0.0/18',
4400 'AX': '185.217.4.0/22',
4401 'AZ': '5.197.0.0/16',
4402 'BA': '31.176.128.0/17',
4403 'BB': '65.48.128.0/17',
4404 'BD': '114.130.0.0/16',
4405 'BE': '57.0.0.0/8',
4406 'BF': '102.178.0.0/15',
4407 'BG': '95.42.0.0/15',
4408 'BH': '37.131.0.0/17',
4409 'BI': '154.117.192.0/18',
4410 'BJ': '137.255.0.0/16',
4411 'BL': '185.212.72.0/23',
4412 'BM': '196.12.64.0/18',
4413 'BN': '156.31.0.0/16',
4414 'BO': '161.56.0.0/16',
4415 'BQ': '161.0.80.0/20',
4416 'BR': '191.128.0.0/12',
4417 'BS': '24.51.64.0/18',
4418 'BT': '119.2.96.0/19',
4419 'BW': '168.167.0.0/16',
4420 'BY': '178.120.0.0/13',
4421 'BZ': '179.42.192.0/18',
4422 'CA': '99.224.0.0/11',
4423 'CD': '41.243.0.0/16',
4424 'CF': '197.242.176.0/21',
4425 'CG': '160.113.0.0/16',
4426 'CH': '85.0.0.0/13',
4427 'CI': '102.136.0.0/14',
4428 'CK': '202.65.32.0/19',
4429 'CL': '152.172.0.0/14',
4430 'CM': '102.244.0.0/14',
4431 'CN': '36.128.0.0/10',
4432 'CO': '181.240.0.0/12',
4433 'CR': '201.192.0.0/12',
4434 'CU': '152.206.0.0/15',
4435 'CV': '165.90.96.0/19',
4436 'CW': '190.88.128.0/17',
4437 'CY': '31.153.0.0/16',
4438 'CZ': '88.100.0.0/14',
4439 'DE': '53.0.0.0/8',
4440 'DJ': '197.241.0.0/17',
4441 'DK': '87.48.0.0/12',
4442 'DM': '192.243.48.0/20',
4443 'DO': '152.166.0.0/15',
4444 'DZ': '41.96.0.0/12',
4445 'EC': '186.68.0.0/15',
4446 'EE': '90.190.0.0/15',
4447 'EG': '156.160.0.0/11',
4448 'ER': '196.200.96.0/20',
4449 'ES': '88.0.0.0/11',
4450 'ET': '196.188.0.0/14',
4451 'EU': '2.16.0.0/13',
4452 'FI': '91.152.0.0/13',
4453 'FJ': '144.120.0.0/16',
4454 'FK': '80.73.208.0/21',
4455 'FM': '119.252.112.0/20',
4456 'FO': '88.85.32.0/19',
4457 'FR': '90.0.0.0/9',
4458 'GA': '41.158.0.0/15',
4459 'GB': '25.0.0.0/8',
4460 'GD': '74.122.88.0/21',
4461 'GE': '31.146.0.0/16',
4462 'GF': '161.22.64.0/18',
4463 'GG': '62.68.160.0/19',
4464 'GH': '154.160.0.0/12',
4465 'GI': '95.164.0.0/16',
4466 'GL': '88.83.0.0/19',
4467 'GM': '160.182.0.0/15',
4468 'GN': '197.149.192.0/18',
4469 'GP': '104.250.0.0/19',
4470 'GQ': '105.235.224.0/20',
4471 'GR': '94.64.0.0/13',
4472 'GT': '168.234.0.0/16',
4473 'GU': '168.123.0.0/16',
4474 'GW': '197.214.80.0/20',
4475 'GY': '181.41.64.0/18',
4476 'HK': '113.252.0.0/14',
4477 'HN': '181.210.0.0/16',
4478 'HR': '93.136.0.0/13',
4479 'HT': '148.102.128.0/17',
4480 'HU': '84.0.0.0/14',
4481 'ID': '39.192.0.0/10',
4482 'IE': '87.32.0.0/12',
4483 'IL': '79.176.0.0/13',
4484 'IM': '5.62.80.0/20',
4485 'IN': '117.192.0.0/10',
4486 'IO': '203.83.48.0/21',
4487 'IQ': '37.236.0.0/14',
4488 'IR': '2.176.0.0/12',
4489 'IS': '82.221.0.0/16',
4490 'IT': '79.0.0.0/10',
4491 'JE': '87.244.64.0/18',
4492 'JM': '72.27.0.0/17',
4493 'JO': '176.29.0.0/16',
4494 'JP': '133.0.0.0/8',
4495 'KE': '105.48.0.0/12',
4496 'KG': '158.181.128.0/17',
4497 'KH': '36.37.128.0/17',
4498 'KI': '103.25.140.0/22',
4499 'KM': '197.255.224.0/20',
4500 'KN': '198.167.192.0/19',
4501 'KP': '175.45.176.0/22',
4502 'KR': '175.192.0.0/10',
4503 'KW': '37.36.0.0/14',
4504 'KY': '64.96.0.0/15',
4505 'KZ': '2.72.0.0/13',
4506 'LA': '115.84.64.0/18',
4507 'LB': '178.135.0.0/16',
4508 'LC': '24.92.144.0/20',
4509 'LI': '82.117.0.0/19',
4510 'LK': '112.134.0.0/15',
4511 'LR': '102.183.0.0/16',
4512 'LS': '129.232.0.0/17',
4513 'LT': '78.56.0.0/13',
4514 'LU': '188.42.0.0/16',
4515 'LV': '46.109.0.0/16',
4516 'LY': '41.252.0.0/14',
4517 'MA': '105.128.0.0/11',
4518 'MC': '88.209.64.0/18',
4519 'MD': '37.246.0.0/16',
4520 'ME': '178.175.0.0/17',
4521 'MF': '74.112.232.0/21',
4522 'MG': '154.126.0.0/17',
4523 'MH': '117.103.88.0/21',
4524 'MK': '77.28.0.0/15',
4525 'ML': '154.118.128.0/18',
4526 'MM': '37.111.0.0/17',
4527 'MN': '49.0.128.0/17',
4528 'MO': '60.246.0.0/16',
4529 'MP': '202.88.64.0/20',
4530 'MQ': '109.203.224.0/19',
4531 'MR': '41.188.64.0/18',
4532 'MS': '208.90.112.0/22',
4533 'MT': '46.11.0.0/16',
4534 'MU': '105.16.0.0/12',
4535 'MV': '27.114.128.0/18',
4536 'MW': '102.70.0.0/15',
4537 'MX': '187.192.0.0/11',
4538 'MY': '175.136.0.0/13',
4539 'MZ': '197.218.0.0/15',
4540 'NA': '41.182.0.0/16',
4541 'NC': '101.101.0.0/18',
4542 'NE': '197.214.0.0/18',
4543 'NF': '203.17.240.0/22',
4544 'NG': '105.112.0.0/12',
4545 'NI': '186.76.0.0/15',
4546 'NL': '145.96.0.0/11',
4547 'NO': '84.208.0.0/13',
4548 'NP': '36.252.0.0/15',
4549 'NR': '203.98.224.0/19',
4550 'NU': '49.156.48.0/22',
4551 'NZ': '49.224.0.0/14',
4552 'OM': '5.36.0.0/15',
4553 'PA': '186.72.0.0/15',
4554 'PE': '186.160.0.0/14',
4555 'PF': '123.50.64.0/18',
4556 'PG': '124.240.192.0/19',
4557 'PH': '49.144.0.0/13',
4558 'PK': '39.32.0.0/11',
4559 'PL': '83.0.0.0/11',
4560 'PM': '70.36.0.0/20',
4561 'PR': '66.50.0.0/16',
4562 'PS': '188.161.0.0/16',
4563 'PT': '85.240.0.0/13',
4564 'PW': '202.124.224.0/20',
4565 'PY': '181.120.0.0/14',
4566 'QA': '37.210.0.0/15',
4567 'RE': '102.35.0.0/16',
4568 'RO': '79.112.0.0/13',
4569 'RS': '93.86.0.0/15',
4570 'RU': '5.136.0.0/13',
4571 'RW': '41.186.0.0/16',
4572 'SA': '188.48.0.0/13',
4573 'SB': '202.1.160.0/19',
4574 'SC': '154.192.0.0/11',
4575 'SD': '102.120.0.0/13',
4576 'SE': '78.64.0.0/12',
4577 'SG': '8.128.0.0/10',
4578 'SI': '188.196.0.0/14',
4579 'SK': '78.98.0.0/15',
4580 'SL': '102.143.0.0/17',
4581 'SM': '89.186.32.0/19',
4582 'SN': '41.82.0.0/15',
4583 'SO': '154.115.192.0/18',
4584 'SR': '186.179.128.0/17',
4585 'SS': '105.235.208.0/21',
4586 'ST': '197.159.160.0/19',
4587 'SV': '168.243.0.0/16',
4588 'SX': '190.102.0.0/20',
4589 'SY': '5.0.0.0/16',
4590 'SZ': '41.84.224.0/19',
4591 'TC': '65.255.48.0/20',
4592 'TD': '154.68.128.0/19',
4593 'TG': '196.168.0.0/14',
4594 'TH': '171.96.0.0/13',
4595 'TJ': '85.9.128.0/18',
4596 'TK': '27.96.24.0/21',
4597 'TL': '180.189.160.0/20',
4598 'TM': '95.85.96.0/19',
4599 'TN': '197.0.0.0/11',
4600 'TO': '175.176.144.0/21',
4601 'TR': '78.160.0.0/11',
4602 'TT': '186.44.0.0/15',
4603 'TV': '202.2.96.0/19',
4604 'TW': '120.96.0.0/11',
4605 'TZ': '156.156.0.0/14',
4606 'UA': '37.52.0.0/14',
4607 'UG': '102.80.0.0/13',
4608 'US': '6.0.0.0/8',
4609 'UY': '167.56.0.0/13',
4610 'UZ': '84.54.64.0/18',
4611 'VA': '212.77.0.0/19',
4612 'VC': '207.191.240.0/21',
4613 'VE': '186.88.0.0/13',
4614 'VG': '66.81.192.0/20',
4615 'VI': '146.226.0.0/16',
4616 'VN': '14.160.0.0/11',
4617 'VU': '202.80.32.0/20',
4618 'WF': '117.20.32.0/21',
4619 'WS': '202.4.32.0/19',
4620 'YE': '134.35.0.0/16',
4621 'YT': '41.242.116.0/22',
4622 'ZA': '41.0.0.0/11',
4623 'ZM': '102.144.0.0/13',
4624 'ZW': '102.177.192.0/18',
4625 }
4626
4627 @classmethod
4628 def random_ipv4(cls, code_or_block):
4629 if len(code_or_block) == 2:
4630 block = cls._country_ip_map.get(code_or_block.upper())
4631 if not block:
4632 return None
4633 else:
4634 block = code_or_block
4635 addr, preflen = block.split('/')
4636 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4637 addr_max = addr_min | (0xffffffff >> int(preflen))
4638 return compat_str(socket.inet_ntoa(
4639 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4640
4641
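# Usage sketch for GeoUtils.random_ipv4 (results are random per call; the
# addresses below only illustrate the shape of the output):
#
#   GeoUtils.random_ipv4('DE')              # e.g. '53.17.203.9', inside 53.0.0.0/8
#   GeoUtils.random_ipv4('192.168.0.0/24')  # e.g. '192.168.0.42'
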
4642 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4643 def __init__(self, proxies=None):
4644 # Set default handlers
4645 for type in ('http', 'https'):
4646 setattr(self, '%s_open' % type,
4647 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4648 meth(r, proxy, type))
4649 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4650
4651 def proxy_open(self, req, proxy, type):
4652 req_proxy = req.headers.get('Ytdl-request-proxy')
4653 if req_proxy is not None:
4654 proxy = req_proxy
4655 del req.headers['Ytdl-request-proxy']
4656
4657 if proxy == '__noproxy__':
4658 return None # No Proxy
4659 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4660 req.add_header('Ytdl-socks-proxy', proxy)
4661 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
4662 return None
4663 return compat_urllib_request.ProxyHandler.proxy_open(
4664 self, req, proxy, type)
4665
4666
4667 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4668 # released into Public Domain
4669 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4670
4671 def long_to_bytes(n, blocksize=0):
4672 """long_to_bytes(n:long, blocksize:int) : string
4673 Convert a long integer to a byte string.
4674
4675 If optional blocksize is given and greater than zero, pad the front of the
4676 byte string with binary zeros so that the length is a multiple of
4677 blocksize.
4678 """
4679 # after much testing, this algorithm was deemed to be the fastest
4680 s = b''
4681 n = int(n)
4682 while n > 0:
4683 s = compat_struct_pack('>I', n & 0xffffffff) + s
4684 n = n >> 32
4685 # strip off leading zeros
4686 for i in range(len(s)):
4687 if s[i] != b'\000'[0]:
4688 break
4689 else:
4690 # only happens when n == 0
4691 s = b'\000'
4692 i = 0
4693 s = s[i:]
4694 # add back some pad bytes. this could be done more efficiently w.r.t. the
4695 # de-padding being done above, but sigh...
4696 if blocksize > 0 and len(s) % blocksize:
4697 s = (blocksize - len(s) % blocksize) * b'\000' + s
4698 return s
4699
4700
4701 def bytes_to_long(s):
4702 """bytes_to_long(string) : long
4703 Convert a byte string to a long integer.
4704
4705 This is (essentially) the inverse of long_to_bytes().
4706 """
4707 acc = 0
4708 length = len(s)
4709 if length % 4:
4710 extra = (4 - length % 4)
4711 s = b'\000' * extra + s
4712 length = length + extra
4713 for i in range(0, length, 4):
4714 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4715 return acc
4716
4717
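# Round-trip sketch for long_to_bytes/bytes_to_long (illustrative values):
#
#   bytes_to_long(b'\x01\x00')     # ==> 256
#   long_to_bytes(256)             # ==> b'\x01\x00'
#   long_to_bytes(1, blocksize=4)  # ==> b'\x00\x00\x00\x01'
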
4718 def ohdave_rsa_encrypt(data, exponent, modulus):
4719 '''
4720 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4721
4722 Input:
4723 data: data to encrypt, bytes-like object
4724 exponent, modulus: parameter e and N of RSA algorithm, both integer
4725 Output: hex string of encrypted data
4726
4727 Limitation: supports one block encryption only
4728 '''
4729
4730 payload = int(binascii.hexlify(data[::-1]), 16)
4731 encrypted = pow(payload, exponent, modulus)
4732 return '%x' % encrypted
4733
4734
4735 def pkcs1pad(data, length):
4736 """
4737 Padding input data with PKCS#1 scheme
4738
4739 @param {int[]} data input data
4740 @param {int} length target length
4741 @returns {int[]} padded data
4742 """
4743 if len(data) > length - 11:
4744 raise ValueError('Input data too long for PKCS#1 padding')
4745
4746 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 padding octets must be nonzero
4747 return [0, 2] + pseudo_random + [0] + data
4748
4749
4750 def encode_base_n(num, n, table=None):
4751 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4752 if not table:
4753 table = FULL_TABLE[:n]
4754
4755 if n > len(table):
4756 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4757
4758 if num == 0:
4759 return table[0]
4760
4761 ret = ''
4762 while num:
4763 ret = table[num % n] + ret
4764 num = num // n
4765 return ret
4766
4767
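# Usage sketch for encode_base_n (illustrative values; decode_base, defined
# further below, is the inverse for an explicit digit table):
#
#   encode_base_n(255, 16)                 # ==> 'ff'
#   encode_base_n(0, 2)                    # ==> '0'
#   decode_base('ff', '0123456789abcdef')  # ==> 255
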
4768 def decode_packed_codes(code):
4769 mobj = re.search(PACKED_CODES_RE, code)
4770 obfuscated_code, base, count, symbols = mobj.groups()
4771 base = int(base)
4772 count = int(count)
4773 symbols = symbols.split('|')
4774 symbol_table = {}
4775
4776 while count:
4777 count -= 1
4778 base_n_count = encode_base_n(count, base)
4779 symbol_table[base_n_count] = symbols[count] or base_n_count
4780
4781 return re.sub(
4782 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4783 obfuscated_code)
4784
4785
4786 def caesar(s, alphabet, shift):
4787 if shift == 0:
4788 return s
4789 l = len(alphabet)
4790 return ''.join(
4791 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4792 for c in s)
4793
4794
4795 def rot47(s):
4796 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4797
4798
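# Usage sketch for caesar/rot47 (illustrative values; rot47 shifts by half of
# the 94-character printable-ASCII alphabet, so applying it twice restores
# the input):
#
#   caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 1)  # ==> 'bcd'
#   rot47('Hello')                                  # ==> 'w6==@'
#   rot47(rot47('Hello'))                           # ==> 'Hello'
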
4799 def parse_m3u8_attributes(attrib):
4800 info = {}
4801 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4802 if val.startswith('"'):
4803 val = val[1:-1]
4804 info[key] = val
4805 return info
4806
4807
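# Usage sketch for parse_m3u8_attributes (illustrative attribute list from an
# EXT-X-STREAM-INF line; note the quoted value may itself contain commas):
#
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.42e00a,mp4a.40.2"')
#   # ==> {'BANDWIDTH': '1280000', 'CODECS': 'avc1.42e00a,mp4a.40.2'}
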
4808 def urshift(val, n):
4809 return val >> n if val >= 0 else (val + 0x100000000) >> n
4810
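# urshift emulates JavaScript's unsigned right shift (>>>) for 32-bit values:
#
#   urshift(-1, 4)  # ==> 268435455 (0x0fffffff)
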
4811
4812 # Based on png2str() written by @gdkchan and improved by @yokrysty
4813 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4814 def decode_png(png_data):
4815 # Reference: https://www.w3.org/TR/PNG/
4816 header = png_data[8:]
4817
4818 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4819 raise OSError('Not a valid PNG file.')
4820
4821 int_map = {1: '>B', 2: '>H', 4: '>I'}
4822 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4823
4824 chunks = []
4825
4826 while header:
4827 length = unpack_integer(header[:4])
4828 header = header[4:]
4829
4830 chunk_type = header[:4]
4831 header = header[4:]
4832
4833 chunk_data = header[:length]
4834 header = header[length:]
4835
4836 header = header[4:] # Skip CRC
4837
4838 chunks.append({
4839 'type': chunk_type,
4840 'length': length,
4841 'data': chunk_data
4842 })
4843
4844 ihdr = chunks[0]['data']
4845
4846 width = unpack_integer(ihdr[:4])
4847 height = unpack_integer(ihdr[4:8])
4848
4849 idat = b''
4850
4851 for chunk in chunks:
4852 if chunk['type'] == b'IDAT':
4853 idat += chunk['data']
4854
4855 if not idat:
4856 raise OSError('Unable to read PNG data.')
4857
4858 decompressed_data = bytearray(zlib.decompress(idat))
4859
4860 stride = width * 3
4861 pixels = []
4862
4863 def _get_pixel(idx):
4864 x = idx % stride
4865 y = idx // stride
4866 return pixels[y][x]
4867
4868 for y in range(height):
4869 basePos = y * (1 + stride)
4870 filter_type = decompressed_data[basePos]
4871
4872 current_row = []
4873
4874 pixels.append(current_row)
4875
4876 for x in range(stride):
4877 color = decompressed_data[1 + basePos + x]
4878 basex = y * stride + x
4879 left = 0
4880 up = 0
4881
4882 if x > 2:
4883 left = _get_pixel(basex - 3)
4884 if y > 0:
4885 up = _get_pixel(basex - stride)
4886
4887 if filter_type == 1: # Sub
4888 color = (color + left) & 0xff
4889 elif filter_type == 2: # Up
4890 color = (color + up) & 0xff
4891 elif filter_type == 3: # Average
4892 color = (color + ((left + up) >> 1)) & 0xff
4893 elif filter_type == 4: # Paeth
4894 a = left
4895 b = up
4896 c = 0
4897
4898 if x > 2 and y > 0:
4899 c = _get_pixel(basex - stride - 3)
4900
4901 p = a + b - c
4902
4903 pa = abs(p - a)
4904 pb = abs(p - b)
4905 pc = abs(p - c)
4906
4907 if pa <= pb and pa <= pc:
4908 color = (color + a) & 0xff
4909 elif pb <= pc:
4910 color = (color + b) & 0xff
4911 else:
4912 color = (color + c) & 0xff
4913
4914 current_row.append(color)
4915
4916 return width, height, pixels
4917
4918
4919 def write_xattr(path, key, value):
4920 # Windows: Write xattrs to NTFS Alternate Data Streams:
4921 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4922 if compat_os_name == 'nt':
4923 assert ':' not in key
4924 assert os.path.exists(path)
4925
4926 try:
4927 with open(f'{path}:{key}', 'wb') as f:
4928 f.write(value)
4929 except OSError as e:
4930 raise XAttrMetadataError(e.errno, e.strerror)
4931 return
4932
4933 # UNIX Method 1. Use the xattr/pyxattr modules
4934 from .dependencies import xattr
4935
4936 setxattr = None
4937 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4938 # Unicode arguments are not supported in pyxattr until version 0.5.0
4939 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4940 if version_tuple(xattr.__version__) >= (0, 5, 0):
4941 setxattr = xattr.set
4942 elif xattr:
4943 setxattr = xattr.setxattr
4944
4945 if setxattr:
4946 try:
4947 setxattr(path, key, value)
4948 except OSError as e:
4949 raise XAttrMetadataError(e.errno, e.strerror)
4950 return
4951
4952 # UNIX Method 2. Use setfattr/xattr executables
4953 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4954 else 'xattr' if check_executable('xattr', ['-h']) else None)
4955 if not exe:
4956 raise XAttrUnavailableError(
4957 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4958 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4959
4960 value = value.decode()
4961 try:
4962 _, stderr, returncode = Popen.run(
4963 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4964 stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4965 except OSError as e:
4966 raise XAttrMetadataError(e.errno, e.strerror)
4967 if returncode:
4968 raise XAttrMetadataError(returncode, stderr)
4969
4970
4971 def random_birthday(year_field, month_field, day_field):
4972 start_date = datetime.date(1950, 1, 1)
4973 end_date = datetime.date(1995, 12, 31)
4974 offset = random.randint(0, (end_date - start_date).days)
4975 random_date = start_date + datetime.timedelta(offset)
4976 return {
4977 year_field: str(random_date.year),
4978 month_field: str(random_date.month),
4979 day_field: str(random_date.day),
4980 }
4981
4982
4983 # Templates for internet shortcut files, which are plain text files.
4984 DOT_URL_LINK_TEMPLATE = '''\
4985 [InternetShortcut]
4986 URL=%(url)s
4987 '''
4988
4989 DOT_WEBLOC_LINK_TEMPLATE = '''\
4990 <?xml version="1.0" encoding="UTF-8"?>
4991 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4992 <plist version="1.0">
4993 <dict>
4994 \t<key>URL</key>
4995 \t<string>%(url)s</string>
4996 </dict>
4997 </plist>
4998 '''
4999
5000 DOT_DESKTOP_LINK_TEMPLATE = '''\
5001 [Desktop Entry]
5002 Encoding=UTF-8
5003 Name=%(filename)s
5004 Type=Link
5005 URL=%(url)s
5006 Icon=text-html
5007 '''
5008
5009 LINK_TEMPLATES = {
5010 'url': DOT_URL_LINK_TEMPLATE,
5011 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5012 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5013 }
5014
5015
5016 def iri_to_uri(iri):
5017 """
5018 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5019
5020 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5021 """
5022
5023 iri_parts = compat_urllib_parse_urlparse(iri)
5024
5025 if '[' in iri_parts.netloc:
5026 raise ValueError('IPv6 URIs are not yet supported.')
5027 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5028
5029 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5030
5031 net_location = ''
5032 if iri_parts.username:
5033 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5034 if iri_parts.password is not None:
5035 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5036 net_location += '@'
5037
5038 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5039 # The 'idna' encoding produces ASCII text.
5040 if iri_parts.port is not None and iri_parts.port != 80:  # NB: 80 is assumed to be the default port for every scheme
5041 net_location += ':' + str(iri_parts.port)
5042
5043 return urllib.parse.urlunparse(
5044 (iri_parts.scheme,
5045 net_location,
5046
5047 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5048
5049 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5050 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5051
5052 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5053 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5054
5055 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5056
5057 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5058
5059
5060 def to_high_limit_path(path):
5061 if sys.platform in ['win32', 'cygwin']:
5062 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5063 return '\\\\?\\' + os.path.abspath(path)
5064
5065 return path
5066
5067
5068 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=None):
5069 val = traverse_obj(obj, *variadic(field))
5070 if (not val and val != 0) if ignore is NO_DEFAULT else val in ignore:
5071 return default
5072 return template % (func(val) if func else val)
5073
5074
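# Usage sketch for format_field (illustrative values):
#
#   format_field({'height': 1080}, 'height', '%sp')       # ==> '1080p'
#   format_field({}, 'height', '%sp', default='unknown')  # ==> 'unknown'
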
5075 def clean_podcast_url(url):
5076 return re.sub(r'''(?x)
5077 (?:
5078 (?:
5079 chtbl\.com/track|
5080 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5081 play\.podtrac\.com
5082 )/[^/]+|
5083 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5084 flex\.acast\.com|
5085 pd(?:
5086 cn\.co| # https://podcorn.com/analytics-prefix/
5087 st\.fm # https://podsights.com/docs/
5088 )/e
5089 )/''', '', url)
5090
5091
5092 _HEX_TABLE = '0123456789abcdef'
5093
5094
5095 def random_uuidv4():
5096 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5097
5098
5099 def make_dir(path, to_screen=None):
5100 try:
5101 dn = os.path.dirname(path)
5102 if dn and not os.path.exists(dn):
5103 os.makedirs(dn)
5104 return True
5105 except OSError as err:
5106 if callable(to_screen):  # `callable()` returns a bool, so comparing it to None was always true
5107 to_screen('unable to create directory ' + error_to_compat_str(err))
5108 return False
5109
5110
5111 def get_executable_path():
5112 from .update import _get_variant_and_executable_path
5113
5114 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5115
5116
5117 def load_plugins(name, suffix, namespace):
5118 classes = {}
5119 with contextlib.suppress(FileNotFoundError):
5120 plugins_spec = importlib.util.spec_from_file_location(
5121 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5122 plugins = importlib.util.module_from_spec(plugins_spec)
5123 sys.modules[plugins_spec.name] = plugins
5124 plugins_spec.loader.exec_module(plugins)
5125 for name in dir(plugins):
5126 if name in namespace:
5127 continue
5128 if not name.endswith(suffix):
5129 continue
5130 klass = getattr(plugins, name)
5131 classes[name] = namespace[name] = klass
5132 return classes
5133
5134
5135 def traverse_obj(
5136 obj, *path_list, default=None, expected_type=None, get_all=True,
5137 casesense=True, is_user_input=False, traverse_string=False):
5138 ''' Traverse nested list/dict/tuple
5139 @param path_list A list of paths which are checked one by one.
5140 Each path is a list of keys where each key is a:
5141 - None: Do nothing
5142 - string: A dictionary key
5143 - int: An index into a list
5144 - tuple: A list of keys all of which will be traversed
5145 - Ellipsis: Fetch all values in the object
5146 - Function: Takes the key and value as arguments
5147 and returns whether the key matches or not
5148 @param default Default value to return
5149 @param expected_type Only accept final value of this type (Can also be any callable)
5150 @param get_all Return all the values obtained from a path or only the first one
5151 @param casesense Whether to consider dictionary keys as case sensitive
5152 @param is_user_input Whether the keys are generated from user input. If True,
5153 strings are converted to int/slice if necessary
5154 @param traverse_string Whether to traverse inside strings. If True, any
5155 non-compatible object will also be converted into a string
5156 # TODO: Write tests
5157 '''
5158 if not casesense:
5159 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5160 path_list = (map(_lower, variadic(path)) for path in path_list)
5161
5162 def _traverse_obj(obj, path, _current_depth=0):
5163 nonlocal depth
5164 path = tuple(variadic(path))
5165 for i, key in enumerate(path):
5166 if None in (key, obj):
5167 return obj
5168 if isinstance(key, (list, tuple)):
5169 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5170 key = ...
5171 if key is ...:
5172 obj = (obj.values() if isinstance(obj, dict)
5173 else obj if isinstance(obj, (list, tuple, LazyList))
5174 else str(obj) if traverse_string else [])
5175 _current_depth += 1
5176 depth = max(depth, _current_depth)
5177 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5178 elif callable(key):
5179 if isinstance(obj, (list, tuple, LazyList)):
5180 obj = enumerate(obj)
5181 elif isinstance(obj, dict):
5182 obj = obj.items()
5183 else:
5184 if not traverse_string:
5185 return None
5186 obj = str(obj)
5187 _current_depth += 1
5188 depth = max(depth, _current_depth)
5189 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5190 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5191 obj = (obj.get(key) if casesense or (key in obj)
5192 else next((v for k, v in obj.items() if _lower(k) == key), None))
5193 else:
5194 if is_user_input:
5195 key = (int_or_none(key) if ':' not in key
5196 else slice(*map(int_or_none, key.split(':'))))
5197 if key == slice(None):
5198 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5199 if not isinstance(key, (int, slice)):
5200 return None
5201 if not isinstance(obj, (list, tuple, LazyList)):
5202 if not traverse_string:
5203 return None
5204 obj = str(obj)
5205 try:
5206 obj = obj[key]
5207 except IndexError:
5208 return None
5209 return obj
5210
5211 if isinstance(expected_type, type):
5212 type_test = lambda val: val if isinstance(val, expected_type) else None
5213 elif expected_type is not None:
5214 type_test = expected_type
5215 else:
5216 type_test = lambda val: val
5217
5218 for path in path_list:
5219 depth = 0
5220 val = _traverse_obj(obj, path)
5221 if val is not None:
5222 if depth:
5223 for _ in range(depth - 1):
5224 val = itertools.chain.from_iterable(v for v in val if v is not None)
5225 val = [v for v in map(type_test, val) if v is not None]
5226 if val:
5227 return val if get_all else val[0]
5228 else:
5229 val = type_test(val)
5230 if val is not None:
5231 return val
5232 return default
5233
5234
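# A small, self-contained illustration of traverse_obj paths (illustrative
# values only; nothing in the module calls this):
def _traverse_obj_examples():
    d = {'a': [{'b': 1}, {'b': 2}]}
    assert traverse_obj(d, ('a', 0, 'b')) == 1  # plain dict/list indexing
    assert traverse_obj(d, ('a', ..., 'b')) == [1, 2]  # Ellipsis fans out over the list
    assert traverse_obj(d, 'missing', ('a', 1, 'b')) == 2  # paths are tried in order

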
5235 def traverse_dict(dictn, keys, casesense=True):
5236 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5237 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5238 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5239
5240
5241 def get_first(obj, keys, **kwargs):
5242 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5243
5244
5245 def variadic(x, allowed_types=(str, bytes, dict)):
5246 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5247
5248
5249 def decode_base(value, digits):
5250 # Convert the given base-n string to an integer
5251 table = {char: index for index, char in enumerate(digits)}
5252 result = 0
5253 base = len(digits)
5254 for char in value:  # renamed from `chr` to avoid shadowing the built-in
5255 result *= base
5256 result += table[char]
5257 return result
5258
5259
5260 def time_seconds(**kwargs):
5261 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5262 return t.timestamp()
5263
5264
5265 # Create a JSON Web Signature (JWS) with the HS256 algorithm;
5266 # the resulting format is JWS Compact Serialization.
5267 # Implemented following JWT (https://www.rfc-editor.org/rfc/rfc7519.html)
5268 # and JWS (https://www.rfc-editor.org/rfc/rfc7515.html)
5269 def jwt_encode_hs256(payload_data, key, headers={}):
5270 header_data = {
5271 'alg': 'HS256',
5272 'typ': 'JWT',
5273 }
5274 if headers:
5275 header_data.update(headers)
5276 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5277 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5278 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5279 signature_b64 = base64.b64encode(h.digest())
5280 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5281 return token
5282
5283
5284 # Can be extended in the future to verify the signature and parse the header, returning the algorithm used if it's not HS256
5285 def jwt_decode_hs256(jwt):
5286 header_b64, payload_b64, signature_b64 = jwt.split('.')
5287 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5288 return payload_data
5289
5290
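# Usage sketch for the JWT helpers (illustrative values; note that decoding
# does NOT verify the signature):
#
#   token = jwt_encode_hs256({'user': 'test'}, 'secret-key')
#   jwt_decode_hs256(token.decode())  # ==> {'user': 'test'}
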
5291 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5292
5293
5294 @functools.cache
5295 def supports_terminal_sequences(stream):
5296 if compat_os_name == 'nt':
5297 if not WINDOWS_VT_MODE:
5298 return False
5299 elif not os.getenv('TERM'):
5300 return False
5301 try:
5302 return stream.isatty()
5303 except BaseException:
5304 return False
5305
5306
5307 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5308 if get_windows_version() < (10, 0, 10586):
5309 return
5310 global WINDOWS_VT_MODE
5311 try:
5312 Popen.run('', shell=True)
5313 except Exception:
5314 return
5315
5316 WINDOWS_VT_MODE = True
5317 supports_terminal_sequences.cache_clear()
5318
5319
5320 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5321
5322
5323 def remove_terminal_sequences(string):
5324 return _terminal_sequences_re.sub('', string)
5325
5326
5327 def number_of_digits(number):
5328 return len('%d' % number)
5329
5330
5331 def join_nonempty(*values, delim='-', from_dict=None):
5332 if from_dict is not None:
5333 values = map(from_dict.get, values)
5334 return delim.join(map(str, filter(None, values)))
5335
5336
5337 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5338 """
5339 Find the largest format dimensions in terms of video width and, for each thumbnail:
5340 * Modify the URL: Match the width with the provided regex and replace it with the largest format's width
5341 * Update dimensions
5342
5343 This function is useful with video services that scale the provided thumbnails on demand
5344 """
5345 _keys = ('width', 'height')
5346 max_dimensions = max(
5347 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5348 default=(0, 0))
5349 if not max_dimensions[0]:
5350 return thumbnails
5351 return [
5352 merge_dicts(
5353 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5354 dict(zip(_keys, max_dimensions)), thumbnail)
5355 for thumbnail in thumbnails
5356 ]
5357
5358
5359 def parse_http_range(range):
5360 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5361 if not range:
5362 return None, None, None
5363 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5364 if not crg:
5365 return None, None, None
5366 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5367
5368
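# Usage sketch for parse_http_range (illustrative values):
#
#   parse_http_range('bytes=10-20/30')  # ==> (10, 20, 30)
#   parse_http_range('bytes 10-/30')    # ==> (10, None, 30)
#   parse_http_range(None)              # ==> (None, None, None)
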
5369 def read_stdin(what):
5370 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5371 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5372 return sys.stdin
5373
5374
5375 class Config:
5376 own_args = None
5377 parsed_args = None
5378 filename = None
5379 __initialized = False
5380
5381 def __init__(self, parser, label=None):
5382 self.parser, self.label = parser, label
5383 self._loaded_paths, self.configs = set(), []
5384
5385 def init(self, args=None, filename=None):
5386 assert not self.__initialized
5387 directory = ''
5388 if filename:
5389 location = os.path.realpath(filename)
5390 directory = os.path.dirname(location)
5391 if location in self._loaded_paths:
5392 return False
5393 self._loaded_paths.add(location)
5394
5395 self.own_args, self.__initialized = args, True
5396 opts, _ = self.parser.parse_known_args(args)
5397 self.parsed_args, self.filename = args, filename
5398
5399 for location in opts.config_locations or []:
5400 if location == '-':
5401 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5402 continue
5403 location = os.path.join(directory, expand_path(location))
5404 if os.path.isdir(location):
5405 location = os.path.join(location, 'yt-dlp.conf')
5406 if not os.path.exists(location):
5407 self.parser.error(f'config location {location} does not exist')
5408 self.append_config(self.read_file(location), location)
5409 return True
5410
5411 def __str__(self):
5412 label = join_nonempty(
5413 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5414 delim=' ')
5415 return join_nonempty(
5416 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5417 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5418 delim='\n')
5419
5420 @staticmethod
5421 def read_file(filename, default=[]):
5422 try:
5423 optionf = open(filename)
5424 except OSError:
5425 return default # silently skip if file is not present
5426 try:
5427 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5428 contents = optionf.read()
5429 res = shlex.split(contents, comments=True)
5430 finally:
5431 optionf.close()
5432 return res
5433
5434 @staticmethod
5435 def hide_login_info(opts):
5436 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5437 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5438
5439 def _scrub_eq(o):
5440 m = eqre.match(o)
5441 if m:
5442 return m.group('key') + '=PRIVATE'
5443 else:
5444 return o
5445
5446 opts = list(map(_scrub_eq, opts))
5447 for idx, opt in enumerate(opts):
5448 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5449 opts[idx + 1] = 'PRIVATE'
5450 return opts
5451
5452 def append_config(self, *args, label=None):
5453 config = type(self)(self.parser, label)
5454 config._loaded_paths = self._loaded_paths
5455 if config.init(*args):
5456 self.configs.append(config)
5457
5458 @property
5459 def all_args(self):
5460 for config in reversed(self.configs):
5461 yield from config.all_args
5462 yield from self.parsed_args or []
5463
5464 def parse_known_args(self, **kwargs):
5465 return self.parser.parse_known_args(self.all_args, **kwargs)
5466
5467 def parse_args(self):
5468 return self.parser.parse_args(self.all_args)
5469
5470
5471 class WebSocketsWrapper():
5472 """Wraps websockets module to use in non-async scopes"""
5473 pool = None
5474
5475 def __init__(self, url, headers=None, connect=True):
5476 self.loop = asyncio.new_event_loop()
5477 # XXX: "loop" is deprecated
5478 self.conn = websockets.connect(
5479 url, extra_headers=headers, ping_interval=None,
5480 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5481 if connect:
5482 self.__enter__()
5483 atexit.register(self.__exit__, None, None, None)
5484
5485 def __enter__(self):
5486 if not self.pool:
5487 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5488 return self
5489
5490 def send(self, *args):
5491 self.run_with_loop(self.pool.send(*args), self.loop)
5492
5493 def recv(self, *args):
5494 return self.run_with_loop(self.pool.recv(*args), self.loop)
5495
5496 def __exit__(self, type, value, traceback):
5497 try:
5498 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5499 finally:
5500 self._cancel_all_tasks(self.loop)  # must run before close(); it calls run_until_complete(), which raises on a closed loop
5501 self.loop.close()
5502
5503 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5504 # for contributors: If any new library that uses asyncio needs to be run in non-async scopes, move these functions out of this class
5505 @staticmethod
5506 def run_with_loop(main, loop):
5507 if not asyncio.iscoroutine(main):
5508 raise ValueError(f'a coroutine was expected, got {main!r}')
5509
5510 try:
5511 return loop.run_until_complete(main)
5512 finally:
5513 loop.run_until_complete(loop.shutdown_asyncgens())
5514 if hasattr(loop, 'shutdown_default_executor'):
5515 loop.run_until_complete(loop.shutdown_default_executor())
5516
5517 @staticmethod
5518 def _cancel_all_tasks(loop):
5519 to_cancel = asyncio.all_tasks(loop)
5520
5521 if not to_cancel:
5522 return
5523
5524 for task in to_cancel:
5525 task.cancel()
5526
5527 # XXX: "loop" is removed in python 3.10+
5528 loop.run_until_complete(
5529 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5530
5531 for task in to_cancel:
5532 if task.cancelled():
5533 continue
5534 if task.exception() is not None:
5535 loop.call_exception_handler({
5536 'message': 'unhandled exception during asyncio.run() shutdown',
5537 'exception': task.exception(),
5538 'task': task,
5539 })
5540
5541
5542 def merge_headers(*dicts):
5543 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5544 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5545
5546
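# Usage sketch for merge_headers (illustrative values):
#
#   merge_headers({'accept': '*/*'}, {'Accept': 'text/html'})
#   # ==> {'Accept': 'text/html'}  (keys are title-cased; later dicts win)
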
5547 class classproperty:
5548 """classmethod(property(func)) that works in py < 3.9"""
5549
5550 def __init__(self, func):
5551 functools.update_wrapper(self, func)
5552 self.func = func
5553
5554 def __get__(self, _, cls):
5555 return self.func(cls)
5556
5557
5558 class Namespace(types.SimpleNamespace):
5559 """Immutable namespace"""
5560
5561 def __iter__(self):
5562 return iter(self.__dict__.values())
5563
5564 @property
5565 def items_(self):
5566 return self.__dict__.items()
5567
5568
5569 # Deprecated
5570 has_certifi = bool(certifi)
5571 has_websockets = bool(websockets)