1 #!/usr/bin/env python3
2 import atexit
3 import base64
4 import binascii
5 import calendar
6 import codecs
7 import collections
8 import contextlib
9 import ctypes
10 import datetime
11 import email.header
12 import email.utils
13 import errno
14 import gzip
15 import hashlib
16 import hmac
17 import importlib.util
18 import io
19 import itertools
20 import json
21 import locale
22 import math
23 import mimetypes
24 import operator
25 import os
26 import platform
27 import random
28 import re
29 import shlex
30 import socket
31 import ssl
32 import subprocess
33 import sys
34 import tempfile
35 import time
36 import traceback
37 import types
38 import urllib.parse
39 import xml.etree.ElementTree
40 import zlib
41
42 from .compat import asyncio, functools # isort: split
43 from .compat import (
44 compat_chr,
45 compat_cookiejar,
46 compat_etree_fromstring,
47 compat_expanduser,
48 compat_html_entities,
49 compat_html_entities_html5,
50 compat_HTMLParseError,
51 compat_HTMLParser,
52 compat_http_client,
53 compat_HTTPError,
54 compat_os_name,
55 compat_parse_qs,
56 compat_shlex_quote,
57 compat_str,
58 compat_struct_pack,
59 compat_struct_unpack,
60 compat_urllib_error,
61 compat_urllib_parse_unquote_plus,
62 compat_urllib_parse_urlencode,
63 compat_urllib_parse_urlparse,
64 compat_urllib_request,
65 compat_urlparse,
66 )
67 from .dependencies import brotli, certifi, websockets
68 from .socks import ProxyType, sockssocket
69
70
71 def register_socks_protocols():
72 # "Register" SOCKS protocols
73 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
74 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
75 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
76 if scheme not in compat_urlparse.uses_netloc:
77 compat_urlparse.uses_netloc.append(scheme)
78
79
80 # This is not clearly defined otherwise
81 compiled_regex_type = type(re.compile(''))
82
83
84 def random_user_agent():
85 _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
86 _CHROME_VERSIONS = (
87 '90.0.4430.212',
88 '90.0.4430.24',
89 '90.0.4430.70',
90 '90.0.4430.72',
91 '90.0.4430.85',
92 '90.0.4430.93',
93 '91.0.4472.101',
94 '91.0.4472.106',
95 '91.0.4472.114',
96 '91.0.4472.124',
97 '91.0.4472.164',
98 '91.0.4472.19',
99 '91.0.4472.77',
100 '92.0.4515.107',
101 '92.0.4515.115',
102 '92.0.4515.131',
103 '92.0.4515.159',
104 '92.0.4515.43',
105 '93.0.4556.0',
106 '93.0.4577.15',
107 '93.0.4577.63',
108 '93.0.4577.82',
109 '94.0.4606.41',
110 '94.0.4606.54',
111 '94.0.4606.61',
112 '94.0.4606.71',
113 '94.0.4606.81',
114 '94.0.4606.85',
115 '95.0.4638.17',
116 '95.0.4638.50',
117 '95.0.4638.54',
118 '95.0.4638.69',
119 '95.0.4638.74',
120 '96.0.4664.18',
121 '96.0.4664.45',
122 '96.0.4664.55',
123 '96.0.4664.93',
124 '97.0.4692.20',
125 )
126 return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
127
128
129 SUPPORTED_ENCODINGS = [
130 'gzip', 'deflate'
131 ]
132 if brotli:
133 SUPPORTED_ENCODINGS.append('br')
134
135 std_headers = {
136 'User-Agent': random_user_agent(),
137 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
138 'Accept-Language': 'en-us,en;q=0.5',
139 'Sec-Fetch-Mode': 'navigate',
140 }
141
142
143 USER_AGENTS = {
144 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
145 }
146
147
148 NO_DEFAULT = object()
149
150 ENGLISH_MONTH_NAMES = [
151 'January', 'February', 'March', 'April', 'May', 'June',
152 'July', 'August', 'September', 'October', 'November', 'December']
153
154 MONTH_NAMES = {
155 'en': ENGLISH_MONTH_NAMES,
156 'fr': [
157 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
158 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
159 }
160
161 KNOWN_EXTENSIONS = (
162 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
163 'flv', 'f4v', 'f4a', 'f4b',
164 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
165 'mkv', 'mka', 'mk3d',
166 'avi', 'divx',
167 'mov',
168 'asf', 'wmv', 'wma',
169 '3gp', '3g2',
170 'mp3',
171 'flac',
172 'ape',
173 'wav',
174 'f4f', 'f4m', 'm3u8', 'smil')
175
176 # needed for sanitizing filenames in restricted mode
177 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
178 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
179 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
180
181 DATE_FORMATS = (
182 '%d %B %Y',
183 '%d %b %Y',
184 '%B %d %Y',
185 '%B %dst %Y',
186 '%B %dnd %Y',
187 '%B %drd %Y',
188 '%B %dth %Y',
189 '%b %d %Y',
190 '%b %dst %Y',
191 '%b %dnd %Y',
192 '%b %drd %Y',
193 '%b %dth %Y',
194 '%b %dst %Y %I:%M',
195 '%b %dnd %Y %I:%M',
196 '%b %drd %Y %I:%M',
197 '%b %dth %Y %I:%M',
198 '%Y %m %d',
199 '%Y-%m-%d',
200 '%Y.%m.%d.',
201 '%Y/%m/%d',
202 '%Y/%m/%d %H:%M',
203 '%Y/%m/%d %H:%M:%S',
204 '%Y%m%d%H%M',
205 '%Y%m%d%H%M%S',
206 '%Y%m%d',
207 '%Y-%m-%d %H:%M',
208 '%Y-%m-%d %H:%M:%S',
209 '%Y-%m-%d %H:%M:%S.%f',
210 '%Y-%m-%d %H:%M:%S:%f',
211 '%d.%m.%Y %H:%M',
212 '%d.%m.%Y %H.%M',
213 '%Y-%m-%dT%H:%M:%SZ',
214 '%Y-%m-%dT%H:%M:%S.%fZ',
215 '%Y-%m-%dT%H:%M:%S.%f0Z',
216 '%Y-%m-%dT%H:%M:%S',
217 '%Y-%m-%dT%H:%M:%S.%f',
218 '%Y-%m-%dT%H:%M',
219 '%b %d %Y at %H:%M',
220 '%b %d %Y at %H:%M:%S',
221 '%B %d %Y at %H:%M',
222 '%B %d %Y at %H:%M:%S',
223 '%H:%M %d-%b-%Y',
224 )
225
226 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
227 DATE_FORMATS_DAY_FIRST.extend([
228 '%d-%m-%Y',
229 '%d.%m.%Y',
230 '%d.%m.%y',
231 '%d/%m/%Y',
232 '%d/%m/%y',
233 '%d/%m/%Y %H:%M:%S',
234 ])
235
236 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
237 DATE_FORMATS_MONTH_FIRST.extend([
238 '%m-%d-%Y',
239 '%m.%d.%Y',
240 '%m/%d/%Y',
241 '%m/%d/%y',
242 '%m/%d/%Y %H:%M:%S',
243 ])
244
245 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
246 JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
247
248 NUMBER_RE = r'\d+(?:\.\d+)?'
249
250
251 @functools.cache
252 def preferredencoding():
253 """Get preferred encoding.
254
255 Returns the best encoding scheme for the system, based on
256 locale.getpreferredencoding() and some further tweaks.
257 """
258 try:
259 pref = locale.getpreferredencoding()
260 'TEST'.encode(pref)
261 except Exception:
262 pref = 'UTF-8'
263
264 return pref
265
266
267 def write_json_file(obj, fn):
268 """ Encode obj as JSON and write it to fn, atomically if possible """
269
270 tf = tempfile.NamedTemporaryFile(
271 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
272 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
273
274 try:
275 with tf:
276 json.dump(obj, tf, ensure_ascii=False)
277 if sys.platform == 'win32':
278 # Need to remove existing file on Windows, else os.rename raises
279 # WindowsError or FileExistsError.
280 with contextlib.suppress(OSError):
281 os.unlink(fn)
282 with contextlib.suppress(OSError):
283 mask = os.umask(0)
284 os.umask(mask)
285 os.chmod(tf.name, 0o666 & ~mask)
286 os.rename(tf.name, fn)
287 except Exception:
288 with contextlib.suppress(OSError):
289 os.remove(tf.name)
290 raise
291
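# Illustrative usage of the atomic-write helper above ('info.json' is a
# hypothetical path): the data is first written to a same-directory `.tmp`
# file and then os.rename()d over the target, so readers never observe a
# half-written file.
#
#   >>> write_json_file({'id': 'abc', 'title': 'test'}, 'info.json')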
292
293 def find_xpath_attr(node, xpath, key, val=None):
294 """ Find the xpath xpath[@key=val] """
295 assert re.match(r'^[a-zA-Z_-]+$', key)
296 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
297 return node.find(expr)
298
299 # The ElementTree .find() calls in this module do not pass a `namespaces`
300 # mapping, so namespace prefixes must be expanded to `{uri}tag` form manually
301
302
303 def xpath_with_ns(path, ns_map):
304 components = [c.split(':') for c in path.split('/')]
305 replaced = []
306 for c in components:
307 if len(c) == 1:
308 replaced.append(c[0])
309 else:
310 ns, tag = c
311 replaced.append('{%s}%s' % (ns_map[ns], tag))
312 return '/'.join(replaced)
313
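# Example: expanding namespace prefixes into ElementTree's `{uri}tag` form
# (the MRSS namespace URI is just an illustration).
#
#   >>> xpath_with_ns('media:group/media:title', {'media': 'http://search.yahoo.com/mrss/'})
#   '{http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}title'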
314
315 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
316 def _find_xpath(xpath):
317 return node.find(xpath)
318
319 if isinstance(xpath, str):
320 n = _find_xpath(xpath)
321 else:
322 for xp in xpath:
323 n = _find_xpath(xp)
324 if n is not None:
325 break
326
327 if n is None:
328 if default is not NO_DEFAULT:
329 return default
330 elif fatal:
331 name = xpath if name is None else name
332 raise ExtractorError('Could not find XML element %s' % name)
333 else:
334 return None
335 return n
336
337
338 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
339 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
340 if n is None or n == default:
341 return n
342 if n.text is None:
343 if default is not NO_DEFAULT:
344 return default
345 elif fatal:
346 name = xpath if name is None else name
347 raise ExtractorError('Could not find XML element\'s text %s' % name)
348 else:
349 return None
350 return n.text
351
352
353 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
354 n = find_xpath_attr(node, xpath, key)
355 if n is None:
356 if default is not NO_DEFAULT:
357 return default
358 elif fatal:
359 name = f'{xpath}[@{key}]' if name is None else name
360 raise ExtractorError('Could not find XML attribute %s' % name)
361 else:
362 return None
363 return n.attrib[key]
364
365
366 def get_element_by_id(id, html, **kwargs):
367 """Return the content of the tag with the specified ID in the passed HTML document"""
368 return get_element_by_attribute('id', id, html, **kwargs)
369
370
371 def get_element_html_by_id(id, html, **kwargs):
372 """Return the html of the tag with the specified ID in the passed HTML document"""
373 return get_element_html_by_attribute('id', id, html, **kwargs)
374
375
376 def get_element_by_class(class_name, html):
377 """Return the content of the first tag with the specified class in the passed HTML document"""
378 retval = get_elements_by_class(class_name, html)
379 return retval[0] if retval else None
380
381
382 def get_element_html_by_class(class_name, html):
383 """Return the html of the first tag with the specified class in the passed HTML document"""
384 retval = get_elements_html_by_class(class_name, html)
385 return retval[0] if retval else None
386
387
388 def get_element_by_attribute(attribute, value, html, **kwargs):
389 retval = get_elements_by_attribute(attribute, value, html, **kwargs)
390 return retval[0] if retval else None
391
392
393 def get_element_html_by_attribute(attribute, value, html, **kwargs):
394 retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
395 return retval[0] if retval else None
396
397
398 def get_elements_by_class(class_name, html, **kwargs):
399 """Return the content of all tags with the specified class in the passed HTML document as a list"""
400 return get_elements_by_attribute(
401 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
402 html, escape_value=False)
403
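# Example: the class regex above matches whole class names only, so 'title'
# does not match an element whose class is 'subtitle'.
#
#   >>> get_elements_by_class('title', '<p class="title">a</p><p class="subtitle">b</p>')
#   ['a']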
404
405 def get_elements_html_by_class(class_name, html):
406 """Return the html of all tags with the specified class in the passed HTML document as a list"""
407 return get_elements_html_by_attribute(
408 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
409 html, escape_value=False)
410
411
412 def get_elements_by_attribute(*args, **kwargs):
413 """Return the content of the tag with the specified attribute in the passed HTML document"""
414 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
415
416
417 def get_elements_html_by_attribute(*args, **kwargs):
418 """Return the html of the tag with the specified attribute in the passed HTML document"""
419 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
420
421
422 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
423 """
424 Return the text (content) and the html (whole) of all tags with the
425 specified attribute in the passed HTML document
426 """
427
428 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
429
430 value = re.escape(value) if escape_value else value
431
432 partial_element_re = rf'''(?x)
433 <(?P<tag>[a-zA-Z0-9:._-]+)
434 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
435 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
436 '''
437
438 for m in re.finditer(partial_element_re, html):
439 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
440
441 yield (
442 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
443 whole
444 )
445
446
447 class HTMLBreakOnClosingTagParser(compat_HTMLParser):
448 """
449 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
450 closing tag for the first opening tag it has encountered, and can be used
451 as a context manager
452 """
453
454 class HTMLBreakOnClosingTagException(Exception):
455 pass
456
457 def __init__(self):
458 self.tagstack = collections.deque()
459 compat_HTMLParser.__init__(self)
460
461 def __enter__(self):
462 return self
463
464 def __exit__(self, *_):
465 self.close()
466
467 def close(self):
468 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
469 # so data remains buffered; we no longer have any interest in it, thus
470 # override this method to discard it
471 pass
472
473 def handle_starttag(self, tag, _):
474 self.tagstack.append(tag)
475
476 def handle_endtag(self, tag):
477 if not self.tagstack:
478 raise compat_HTMLParseError('no tags in the stack')
479 while self.tagstack:
480 inner_tag = self.tagstack.pop()
481 if inner_tag == tag:
482 break
483 else:
484 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
485 if not self.tagstack:
486 raise self.HTMLBreakOnClosingTagException()
487
488
489 def get_element_text_and_html_by_tag(tag, html):
490 """
491 For the first element with the specified tag in the passed HTML document
492 return its content (text) and the whole element (html)
493 """
494 def find_or_raise(haystack, needle, exc):
495 try:
496 return haystack.index(needle)
497 except ValueError:
498 raise exc
499 closing_tag = f'</{tag}>'
500 whole_start = find_or_raise(
501 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
502 content_start = find_or_raise(
503 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
504 content_start += whole_start + 1
505 with HTMLBreakOnClosingTagParser() as parser:
506 parser.feed(html[whole_start:content_start])
507 if not parser.tagstack or parser.tagstack[0] != tag:
508 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
509 offset = content_start
510 while offset < len(html):
511 next_closing_tag_start = find_or_raise(
512 html[offset:], closing_tag,
513 compat_HTMLParseError(f'closing {tag} tag not found'))
514 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
515 try:
516 parser.feed(html[offset:offset + next_closing_tag_end])
517 offset += next_closing_tag_end
518 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
519 return html[content_start:offset + next_closing_tag_start], \
520 html[whole_start:offset + next_closing_tag_end]
521 raise compat_HTMLParseError('unexpected end of html')
522
523
524 class HTMLAttributeParser(compat_HTMLParser):
525 """Trivial HTML parser to gather the attributes for a single element"""
526
527 def __init__(self):
528 self.attrs = {}
529 compat_HTMLParser.__init__(self)
530
531 def handle_starttag(self, tag, attrs):
532 self.attrs = dict(attrs)
533
534
535 class HTMLListAttrsParser(compat_HTMLParser):
536 """HTML parser to gather the attributes for the elements of a list"""
537
538 def __init__(self):
539 compat_HTMLParser.__init__(self)
540 self.items = []
541 self._level = 0
542
543 def handle_starttag(self, tag, attrs):
544 if tag == 'li' and self._level == 0:
545 self.items.append(dict(attrs))
546 self._level += 1
547
548 def handle_endtag(self, tag):
549 self._level -= 1
550
551
552 def extract_attributes(html_element):
553 """Given a string for an HTML element such as
554 <el
555 a="foo" B="bar" c="&98;az" d=boz
556 empty= noval entity="&amp;"
557 sq='"' dq="'"
558 >
559 Decode and return a dictionary of attributes.
560 {
561 'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
562 'empty': '', 'noval': None, 'entity': '&',
563 'sq': '"', 'dq': '\''
564 }.
565 """
566 parser = HTMLAttributeParser()
567 with contextlib.suppress(compat_HTMLParseError):
568 parser.feed(html_element)
569 parser.close()
570 return parser.attrs
571
572
573 def parse_list(webpage):
574 """Given a string for an series of HTML <li> elements,
575 return a dictionary of their attributes"""
576 parser = HTMLListAttrsParser()
577 parser.feed(webpage)
578 parser.close()
579 return parser.items
580
581
582 def clean_html(html):
583 """Clean an HTML snippet into a readable string"""
584
585 if html is None: # Convenience for sanitizing descriptions etc.
586 return html
587
588 html = re.sub(r'\s+', ' ', html)
589 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
590 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
591 # Strip html tags
592 html = re.sub('<.*?>', '', html)
593 # Replace html entities
594 html = unescapeHTML(html)
595 return html.strip()
596
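# Example: <br> becomes a newline, tags are stripped and entities are decoded.
#
#   >>> clean_html('<p>foo<br/>bar &amp; baz</p>')
#   'foo\nbar & baz'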
597
598 class LenientJSONDecoder(json.JSONDecoder):
599 def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
600 self.transform_source, self.ignore_extra = transform_source, ignore_extra
601 super().__init__(*args, **kwargs)
602
603 def decode(self, s):
604 if self.transform_source:
605 s = self.transform_source(s)
606 if self.ignore_extra:
607 return self.raw_decode(s.lstrip())[0]
608 return super().decode(s)
609
610
611 def sanitize_open(filename, open_mode):
612 """Try to open the given filename, and slightly tweak it if this fails.
613
614 Attempts to open the given filename. If this fails, it tries to change
615 the filename slightly, step by step, until it's either able to open it
616 or it fails and raises a final exception, like the standard open()
617 function.
618
619 It returns the tuple (stream, definitive_file_name).
620 """
621 if filename == '-':
622 if sys.platform == 'win32':
623 import msvcrt
624 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
625 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
626
627 for attempt in range(2):
628 try:
629 try:
630 if sys.platform == 'win32':
631 # FIXME: An exclusive lock also locks the file from being read.
632 # Since windows locks are mandatory, don't lock the file on windows (for now).
633 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
634 raise LockingUnsupportedError()
635 stream = locked_file(filename, open_mode, block=False).__enter__()
636 except OSError:
637 stream = open(filename, open_mode)
638 return stream, filename
639 except OSError as err:
640 if attempt or err.errno in (errno.EACCES,):
641 raise
642 old_filename, filename = filename, sanitize_path(filename)
643 if old_filename == filename:
644 raise
645
646
647 def timeconvert(timestr):
648 """Convert RFC 2822 defined time string into system timestamp"""
649 timestamp = None
650 timetuple = email.utils.parsedate_tz(timestr)
651 if timetuple is not None:
652 timestamp = email.utils.mktime_tz(timetuple)
653 return timestamp
654
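# Example: RFC 2822 dates carry an explicit UTC offset, so the result is an
# absolute epoch timestamp.
#
#   >>> timeconvert('Thu, 01 Jan 1970 00:00:00 +0000')
#   0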
655
656 def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
657 """Sanitizes a string so it could be used as part of a filename.
658 @param restricted Use a stricter subset of allowed characters
659 @param is_id Whether this is an ID that should be kept unchanged if possible.
660 If unset, yt-dlp's new sanitization rules are in effect
661 """
662 if s == '':
663 return ''
664
665 def replace_insane(char):
666 if restricted and char in ACCENT_CHARS:
667 return ACCENT_CHARS[char]
668 elif not restricted and char == '\n':
669 return '\0 '
670 elif char == '?' or ord(char) < 32 or ord(char) == 127:
671 return ''
672 elif char == '"':
673 return '' if restricted else '\''
674 elif char == ':':
675 return '\0_\0-' if restricted else '\0 \0-'
676 elif char in '\\/|*<>':
677 return '\0_'
678 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
679 return '\0_'
680 return char
681
682 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
683 result = ''.join(map(replace_insane, s))
684 if is_id is NO_DEFAULT:
685 result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result) # Remove repeated substitute chars
686 STRIP_RE = '(?:\0.|[ _-])*'
687 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
688 result = result.replace('\0', '') or '_'
689
690 if not is_id:
691 while '__' in result:
692 result = result.replace('__', '_')
693 result = result.strip('_')
694 # Common case of "Foreign band name - English song title"
695 if restricted and result.startswith('-_'):
696 result = result[2:]
697 if result.startswith('-'):
698 result = '_' + result[len('-'):]
699 result = result.lstrip('.')
700 if not result:
701 result = '_'
702 return result
703
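# Example, traced against the substitution rules above: in restricted mode
# ': ' collapses to '_-_' and '?' is dropped.
#
#   >>> sanitize_filename('yt-dlp: test?', restricted=True)
#   'yt-dlp_-_test'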
704
705 def sanitize_path(s, force=False):
706 """Sanitizes and normalizes path on Windows"""
707 if sys.platform == 'win32':
708 force = False
709 drive_or_unc, _ = os.path.splitdrive(s)
710 elif force:
711 drive_or_unc = ''
712 else:
713 return s
714
715 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
716 if drive_or_unc:
717 norm_path.pop(0)
718 sanitized_path = [
719 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
720 for path_part in norm_path]
721 if drive_or_unc:
722 sanitized_path.insert(0, drive_or_unc + os.path.sep)
723 elif force and s and s[0] == os.path.sep:
724 sanitized_path.insert(0, os.path.sep)
725 return os.path.join(*sanitized_path)
726
727
728 def sanitize_url(url):
729 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
730 # the number of unwanted failures due to missing protocol
731 if url is None:
732 return
733 elif url.startswith('//'):
734 return 'http:%s' % url
735 # Fix some common typos seen so far
736 COMMON_TYPOS = (
737 # https://github.com/ytdl-org/youtube-dl/issues/15649
738 (r'^httpss://', r'https://'),
739 # https://bx1.be/lives/direct-tv/
740 (r'^rmtp([es]?)://', r'rtmp\1://'),
741 )
742 for mistake, fixup in COMMON_TYPOS:
743 if re.match(mistake, url):
744 return re.sub(mistake, fixup, url)
745 return url
746
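# Examples of the fixups above:
#
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com/video')
#   'https://example.com/video'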
747
748 def extract_basic_auth(url):
749 parts = compat_urlparse.urlsplit(url)
750 if parts.username is None:
751 return url, None
752 url = compat_urlparse.urlunsplit(parts._replace(netloc=(
753 parts.hostname if parts.port is None
754 else '%s:%d' % (parts.hostname, parts.port))))
755 auth_payload = base64.b64encode(
756 ('%s:%s' % (parts.username, parts.password or '')).encode())
757 return url, f'Basic {auth_payload.decode()}'
758
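# Example: credentials are moved out of the netloc and into a header value.
#
#   >>> extract_basic_auth('http://user:pass@example.com/path')
#   ('http://example.com/path', 'Basic dXNlcjpwYXNz')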
759
760 def sanitized_Request(url, *args, **kwargs):
761 url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
762 if auth_header is not None:
763 headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
764 headers['Authorization'] = auth_header
765 return compat_urllib_request.Request(url, *args, **kwargs)
766
767
768 def expand_path(s):
769 """Expand shell variables and ~"""
770 return os.path.expandvars(compat_expanduser(s))
771
772
773 def orderedSet(iterable, *, lazy=False):
774 """Remove all duplicates from the input iterable"""
775 def _iter():
776 seen = [] # Do not use set since the items can be unhashable
777 for x in iterable:
778 if x not in seen:
779 seen.append(x)
780 yield x
781
782 return _iter() if lazy else list(_iter())
783
784
785 def _htmlentity_transform(entity_with_semicolon):
786 """Transforms an HTML entity to a character."""
787 entity = entity_with_semicolon[:-1]
788
789 # Known non-numeric HTML entity
790 if entity in compat_html_entities.name2codepoint:
791 return compat_chr(compat_html_entities.name2codepoint[entity])
792
793 # TODO: HTML5 allows entities without a semicolon. For example,
794 # '&Eacuteric' should be decoded as 'Éric'.
795 if entity_with_semicolon in compat_html_entities_html5:
796 return compat_html_entities_html5[entity_with_semicolon]
797
798 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
799 if mobj is not None:
800 numstr = mobj.group(1)
801 if numstr.startswith('x'):
802 base = 16
803 numstr = '0%s' % numstr
804 else:
805 base = 10
806 # See https://github.com/ytdl-org/youtube-dl/issues/7518
807 with contextlib.suppress(ValueError):
808 return compat_chr(int(numstr, base))
809
810 # Unknown entity in name, return its literal representation
811 return '&%s;' % entity
812
813
814 def unescapeHTML(s):
815 if s is None:
816 return None
817 assert isinstance(s, str)
818
819 return re.sub(
820 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
821
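# Example: named, decimal and hexadecimal entities are all handled;
# unknown entities are left as-is.
#
#   >>> unescapeHTML('&amp; &#38; &#x26; &fakeentity;')
#   '& & & &fakeentity;'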
822
823 def escapeHTML(text):
824 return (
825 text
826 .replace('&', '&amp;')
827 .replace('<', '&lt;')
828 .replace('>', '&gt;')
829 .replace('"', '&quot;')
830 .replace("'", '&#39;')
831 )
832
833
834 def process_communicate_or_kill(p, *args, **kwargs):
835 write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
836 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
837 return Popen.communicate_or_kill(p, *args, **kwargs)
838
839
840 class Popen(subprocess.Popen):
841 if sys.platform == 'win32':
842 _startupinfo = subprocess.STARTUPINFO()
843 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
844 else:
845 _startupinfo = None
846
847 def __init__(self, *args, text=False, **kwargs):
848 if text is True:
849 kwargs['universal_newlines'] = True # For 3.6 compatibility
850 kwargs.setdefault('encoding', 'utf-8')
851 kwargs.setdefault('errors', 'replace')
852 super().__init__(*args, **kwargs, startupinfo=self._startupinfo)
853
854 def communicate_or_kill(self, *args, **kwargs):
855 try:
856 return self.communicate(*args, **kwargs)
857 except BaseException: # Including KeyboardInterrupt
858 self.kill(timeout=None)
859 raise
860
861 def kill(self, *, timeout=0):
862 super().kill()
863 if timeout != 0:
864 self.wait(timeout=timeout)
865
866 @classmethod
867 def run(cls, *args, **kwargs):
868 with cls(*args, **kwargs) as proc:
869 stdout, stderr = proc.communicate_or_kill()
870 return stdout or '', stderr or '', proc.returncode
871
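# Illustrative use of Popen.run (assumes a POSIX `echo` binary; any argv
# works the same way):
#
#   >>> stdout, stderr, retcode = Popen.run(
#   ...     ['echo', 'hi'], text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#   >>> stdout.strip(), retcode
#   ('hi', 0)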
872
873 def get_subprocess_encoding():
874 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
875 # For subprocess calls, encode with locale encoding
876 # Refer to http://stackoverflow.com/a/9951851/35070
877 encoding = preferredencoding()
878 else:
879 encoding = sys.getfilesystemencoding()
880 if encoding is None:
881 encoding = 'utf-8'
882 return encoding
883
884
885 def encodeFilename(s, for_subprocess=False):
886 assert isinstance(s, str)
887 return s
888
889
890 def decodeFilename(b, for_subprocess=False):
891 return b
892
893
894 def encodeArgument(s):
895 # Legacy code that uses byte strings
896 # Uncomment the following line after fixing all post processors
897 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
898 return s if isinstance(s, str) else s.decode('ascii')
899
900
901 def decodeArgument(b):
902 return b
903
904
905 def decodeOption(optval):
906 if optval is None:
907 return optval
908 if isinstance(optval, bytes):
909 optval = optval.decode(preferredencoding())
910
911 assert isinstance(optval, compat_str)
912 return optval
913
914
915 _timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
916
917
918 def timetuple_from_msec(msec):
919 secs, msec = divmod(msec, 1000)
920 mins, secs = divmod(secs, 60)
921 hrs, mins = divmod(mins, 60)
922 return _timetuple(hrs, mins, secs, msec)
923
924
925 def formatSeconds(secs, delim=':', msec=False):
926 time = timetuple_from_msec(secs * 1000)
927 if time.hours:
928 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
929 elif time.minutes:
930 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
931 else:
932 ret = '%d' % time.seconds
933 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
934
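# Example: 3661.5 seconds is 1h 1m 1.5s.
#
#   >>> formatSeconds(3661.5, msec=True)
#   '1:01:01.500'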
935
936 def _ssl_load_windows_store_certs(ssl_context, storename):
937 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
938 try:
939 certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
940 if encoding == 'x509_asn' and (
941 trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
942 except PermissionError:
943 return
944 for cert in certs:
945 with contextlib.suppress(ssl.SSLError):
946 ssl_context.load_verify_locations(cadata=cert)
947
948
949 def make_HTTPS_handler(params, **kwargs):
950 opts_check_certificate = not params.get('nocheckcertificate')
951 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
952 context.check_hostname = opts_check_certificate
953 if params.get('legacyserverconnect'):
954 context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
955 # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
956 context.set_ciphers('DEFAULT')
957
958 context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
959 if opts_check_certificate:
960 if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
961 context.load_verify_locations(cafile=certifi.where())
962 try:
963 context.load_default_certs()
964 # Work around the issue in load_default_certs when there are bad certificates. See:
965 # https://github.com/yt-dlp/yt-dlp/issues/1060,
966 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
967 except ssl.SSLError:
968 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
969 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
970 for storename in ('CA', 'ROOT'):
971 _ssl_load_windows_store_certs(context, storename)
972 context.set_default_verify_paths()
973
974 client_certfile = params.get('client_certificate')
975 if client_certfile:
976 try:
977 context.load_cert_chain(
978 client_certfile, keyfile=params.get('client_certificate_key'),
979 password=params.get('client_certificate_password'))
980 except ssl.SSLError:
981 raise YoutubeDLError('Unable to load client certificate')
982
983 # Some servers may reject requests if ALPN extension is not sent. See:
984 # https://github.com/python/cpython/issues/85140
985 # https://github.com/yt-dlp/yt-dlp/issues/3878
986 with contextlib.suppress(NotImplementedError):
987 context.set_alpn_protocols(['http/1.1'])
988
989 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
990
991
992 def bug_reports_message(before=';'):
993 msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
994 'filling out the appropriate issue template. '
995 'Confirm you are on the latest version using yt-dlp -U')
996
997 before = before.rstrip()
998 if not before or before.endswith(('.', '!', '?')):
999 msg = msg[0].title() + msg[1:]
1000
1001 return (before + ' ' if before else '') + msg
1002
1003
1004 class YoutubeDLError(Exception):
1005 """Base exception for YoutubeDL errors."""
1006 msg = None
1007
1008 def __init__(self, msg=None):
1009 if msg is not None:
1010 self.msg = msg
1011 elif self.msg is None:
1012 self.msg = type(self).__name__
1013 super().__init__(self.msg)
1014
1015
1016 network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
1017 if hasattr(ssl, 'CertificateError'):
1018 network_exceptions.append(ssl.CertificateError)
1019 network_exceptions = tuple(network_exceptions)
1020
1021
1022 class ExtractorError(YoutubeDLError):
1023 """Error during info extraction."""
1024
1025 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
1026 """ tb, if given, is the original traceback (so that it can be printed out).
1027 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
1028 """
1029 if sys.exc_info()[0] in network_exceptions:
1030 expected = True
1031
1032 self.orig_msg = str(msg)
1033 self.traceback = tb
1034 self.expected = expected
1035 self.cause = cause
1036 self.video_id = video_id
1037 self.ie = ie
1038 self.exc_info = sys.exc_info() # preserve original exception
1039
1040 super().__init__(''.join((
1041 format_field(ie, None, '[%s] '),
1042 format_field(video_id, None, '%s: '),
1043 msg,
1044 format_field(cause, None, ' (caused by %r)'),
1045 '' if expected else bug_reports_message())))
1046
1047 def format_traceback(self):
1048 return join_nonempty(
1049 self.traceback and ''.join(traceback.format_tb(self.traceback)),
1050 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
1051 delim='\n') or None
1052
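# Example of how the message is assembled by the positional format_field()
# calls above (extractor/video values are hypothetical):
#
#   >>> str(ExtractorError('Unable to extract', video_id='abc123', ie='Youtube', expected=True))
#   '[Youtube] abc123: Unable to extract'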
1053
1054 class UnsupportedError(ExtractorError):
1055 def __init__(self, url):
1056 super().__init__(
1057 'Unsupported URL: %s' % url, expected=True)
1058 self.url = url
1059
1060
1061 class RegexNotFoundError(ExtractorError):
1062 """Error when a regex didn't match"""
1063 pass
1064
1065
1066 class GeoRestrictedError(ExtractorError):
1067 """Geographic restriction Error exception.
1068
1069 This exception may be thrown when a video is not available from your
1070 geographic location due to geographic restrictions imposed by a website.
1071 """
1072
1073 def __init__(self, msg, countries=None, **kwargs):
1074 kwargs['expected'] = True
1075 super().__init__(msg, **kwargs)
1076 self.countries = countries
1077
1078
1079 class DownloadError(YoutubeDLError):
1080 """Download Error exception.
1081
1082 This exception may be thrown by FileDownloader objects if they are not
1083 configured to continue on errors. They will contain the appropriate
1084 error message.
1085 """
1086
1087 def __init__(self, msg, exc_info=None):
1088 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
1089 super().__init__(msg)
1090 self.exc_info = exc_info
1091
1092
1093 class EntryNotInPlaylist(YoutubeDLError):
1094 """Entry not in playlist exception.
1095
1096 This exception will be thrown by YoutubeDL when a requested entry
1097 is not found in the playlist info_dict
1098 """
1099 msg = 'Entry not found in info'
1100
1101
1102 class SameFileError(YoutubeDLError):
1103 """Same File exception.
1104
1105 This exception will be thrown by FileDownloader objects if they detect
1106 multiple files would have to be downloaded to the same file on disk.
1107 """
1108 msg = 'Fixed output name but more than one file to download'
1109
1110 def __init__(self, filename=None):
1111 if filename is not None:
1112 self.msg += f': {filename}'
1113 super().__init__(self.msg)
1114
1115
1116 class PostProcessingError(YoutubeDLError):
1117 """Post Processing exception.
1118
1119 This exception may be raised by PostProcessor's .run() method to
1120 indicate an error in the postprocessing task.
1121 """
1122
1123
1124 class DownloadCancelled(YoutubeDLError):
1125 """ Exception raised when the download queue should be interrupted """
1126 msg = 'The download was cancelled'
1127
1128
1129 class ExistingVideoReached(DownloadCancelled):
1130 """ --break-on-existing triggered """
1131 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
1132
1133
1134 class RejectedVideoReached(DownloadCancelled):
1135 """ --break-on-reject triggered """
1136 msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
1137
1138
1139 class MaxDownloadsReached(DownloadCancelled):
1140 """ --max-downloads limit has been reached. """
1141 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1142
1143
1144 class ReExtractInfo(YoutubeDLError):
1145 """ Video info needs to be re-extracted. """
1146
1147 def __init__(self, msg, expected=False):
1148 super().__init__(msg)
1149 self.expected = expected
1150
1151
1152 class ThrottledDownload(ReExtractInfo):
1153 """ Download speed below --throttled-rate. """
1154 msg = 'The download speed is below throttle limit'
1155
1156 def __init__(self):
1157 super().__init__(self.msg, expected=False)
1158
1159
1160 class UnavailableVideoError(YoutubeDLError):
1161 """Unavailable Format exception.
1162
1163 This exception will be thrown when a video is requested
1164 in a format that is not available for that video.
1165 """
1166 msg = 'Unable to download video'
1167
1168 def __init__(self, err=None):
1169 if err is not None:
1170 self.msg += f': {err}'
1171 super().__init__(self.msg)
1172
1173
1174 class ContentTooShortError(YoutubeDLError):
1175 """Content Too Short exception.
1176
1177 This exception may be raised by FileDownloader objects when a file they
1178 download is too small for what the server announced first, indicating
1179 the connection was probably interrupted.
1180 """
1181
1182 def __init__(self, downloaded, expected):
1183 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
1184 # Both in bytes
1185 self.downloaded = downloaded
1186 self.expected = expected
1187
1188
1189 class XAttrMetadataError(YoutubeDLError):
1190 def __init__(self, code=None, msg='Unknown error'):
1191 super().__init__(msg)
1192 self.code = code
1193 self.msg = msg
1194
1195 # Parsing code and msg
1196 if (self.code in (errno.ENOSPC, errno.EDQUOT)
1197 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1198 self.reason = 'NO_SPACE'
1199 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1200 self.reason = 'VALUE_TOO_LONG'
1201 else:
1202 self.reason = 'NOT_SUPPORTED'
1203
1204
1205 class XAttrUnavailableError(YoutubeDLError):
1206 pass
1207
1208
1209 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
1210 hc = http_class(*args, **kwargs)
1211 source_address = ydl_handler._params.get('source_address')
1212
1213 if source_address is not None:
1214 # This is to workaround _create_connection() from socket where it will try all
1215 # address data from getaddrinfo() including IPv6. This filters the result from
1216 # getaddrinfo() based on the source_address value.
1217 # This is based on the cpython socket.create_connection() function.
1218 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1219 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1220 host, port = address
1221 err = None
1222 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1223 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1224 ip_addrs = [addr for addr in addrs if addr[0] == af]
1225 if addrs and not ip_addrs:
1226 ip_version = 'v4' if af == socket.AF_INET else 'v6'
1227 raise OSError(
1228 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1229 % (ip_version, source_address[0]))
1230 for res in ip_addrs:
1231 af, socktype, proto, canonname, sa = res
1232 sock = None
1233 try:
1234 sock = socket.socket(af, socktype, proto)
1235 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1236 sock.settimeout(timeout)
1237 sock.bind(source_address)
1238 sock.connect(sa)
1239 err = None # Explicitly break reference cycle
1240 return sock
1241 except OSError as _:
1242 err = _
1243 if sock is not None:
1244 sock.close()
1245 if err is not None:
1246 raise err
1247 else:
1248 raise OSError('getaddrinfo returns an empty list')
1249 if hasattr(hc, '_create_connection'):
1250 hc._create_connection = _create_connection
1251 hc.source_address = (source_address, 0)
1252
1253 return hc
1254
1255
1256 def handle_youtubedl_headers(headers):
1257 filtered_headers = headers
1258
1259 if 'Youtubedl-no-compression' in filtered_headers:
1260 filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
1261 del filtered_headers['Youtubedl-no-compression']
1262
1263 return filtered_headers
1264
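# Example: the internal marker header suppresses compression and is itself
# stripped before the request is sent.
#
#   >>> handle_youtubedl_headers(
#   ...     {'Youtubedl-no-compression': '1', 'Accept-Encoding': 'gzip', 'User-Agent': 'UA'})
#   {'User-Agent': 'UA'}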
1265
1266 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
1267 """Handler for HTTP requests and responses.
1268
1269 This class, when installed with an OpenerDirector, automatically adds
1270 the standard headers to every HTTP request and handles gzipped and
1271 deflated responses from web servers. If compression is to be avoided in
1272 a particular request, the original request in the program code only has
1273 to include the HTTP header "Youtubedl-no-compression", which will be
1274 removed before making the real request.
1275
1276 Part of this code was copied from:
1277
1278 http://techknack.net/python-urllib2-handlers/
1279
1280 Andrew Rowls, the author of that code, agreed to release it to the
1281 public domain.
1282 """
1283
1284 def __init__(self, params, *args, **kwargs):
1285 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
1286 self._params = params
1287
1288 def http_open(self, req):
1289 conn_class = compat_http_client.HTTPConnection
1290
1291 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1292 if socks_proxy:
1293 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1294 del req.headers['Ytdl-socks-proxy']
1295
1296 return self.do_open(functools.partial(
1297 _create_http_connection, self, conn_class, False),
1298 req)
1299
1300 @staticmethod
1301 def deflate(data):
1302 if not data:
1303 return data
1304 try:
1305 return zlib.decompress(data, -zlib.MAX_WBITS)
1306 except zlib.error:
1307 return zlib.decompress(data)
1308
1309 @staticmethod
1310 def brotli(data):
1311 if not data:
1312 return data
1313 return brotli.decompress(data)
1314
1315 def http_request(self, req):
1316 # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
1317 # always respected by websites, and some give out URLs with non percent-encoded
1318 # non-ASCII characters (see telemb.py, ard.py [#3412])
1319 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1320 # To work around aforementioned issue we will replace request's original URL with
1321 # percent-encoded one
1322 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1323 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1324 url = req.get_full_url()
1325 url_escaped = escape_url(url)
1326
1327 # Substitute URL if any change after escaping
1328 if url != url_escaped:
1329 req = update_Request(req, url=url_escaped)
1330
1331 for h, v in self._params.get('http_headers', std_headers).items():
1332 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1333 # The dict keys are capitalized because of this bug by urllib
1334 if h.capitalize() not in req.headers:
1335 req.add_header(h, v)
1336
1337 if 'Accept-encoding' not in req.headers:
1338 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1339
1340 req.headers = handle_youtubedl_headers(req.headers)
1341
1342 return req
1343
1344 def http_response(self, req, resp):
1345 old_resp = resp
1346 # gzip
1347 if resp.headers.get('Content-encoding', '') == 'gzip':
1348 content = resp.read()
1349 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1350 try:
1351 uncompressed = io.BytesIO(gz.read())
1352 except OSError as original_ioerror:
1353 # There may be junk at the end of the file
1354 # See http://stackoverflow.com/q/4928560/35070 for details
1355 for i in range(1, 1024):
1356 try:
1357 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1358 uncompressed = io.BytesIO(gz.read())
1359 except OSError:
1360 continue
1361 break
1362 else:
1363 raise original_ioerror
1364 resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
1365 resp.msg = old_resp.msg
1366 del resp.headers['Content-encoding']
1367 # deflate
1368 if resp.headers.get('Content-encoding', '') == 'deflate':
1369 gz = io.BytesIO(self.deflate(resp.read()))
1370 resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
1371 resp.msg = old_resp.msg
1372 del resp.headers['Content-encoding']
1373 # brotli
1374 if resp.headers.get('Content-encoding', '') == 'br':
1375 resp = compat_urllib_request.addinfourl(
1376 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1377 resp.msg = old_resp.msg
1378 del resp.headers['Content-encoding']
1379 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1380 # https://github.com/ytdl-org/youtube-dl/issues/6457).
1381 if 300 <= resp.code < 400:
1382 location = resp.headers.get('Location')
1383 if location:
1384 # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
1385 location = location.encode('iso-8859-1').decode()
1386 location_escaped = escape_url(location)
1387 if location != location_escaped:
1388 del resp.headers['Location']
1389 resp.headers['Location'] = location_escaped
1390 return resp
1391
1392 https_request = http_request
1393 https_response = http_response
1394
1395
1396 def make_socks_conn_class(base_class, socks_proxy):
1397 assert issubclass(base_class, (
1398 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1399
1400 url_components = compat_urlparse.urlparse(socks_proxy)
1401 if url_components.scheme.lower() == 'socks5':
1402 socks_type = ProxyType.SOCKS5
1403 elif url_components.scheme.lower() in ('socks', 'socks4'):
1404 socks_type = ProxyType.SOCKS4
1405 elif url_components.scheme.lower() == 'socks4a':
1406 socks_type = ProxyType.SOCKS4A
1407
1408 def unquote_if_non_empty(s):
1409 if not s:
1410 return s
1411 return compat_urllib_parse_unquote_plus(s)
1412
1413 proxy_args = (
1414 socks_type,
1415 url_components.hostname, url_components.port or 1080,
1416 True, # Remote DNS
1417 unquote_if_non_empty(url_components.username),
1418 unquote_if_non_empty(url_components.password),
1419 )
1420
1421 class SocksConnection(base_class):
1422 def connect(self):
1423 self.sock = sockssocket()
1424 self.sock.setproxy(*proxy_args)
1425 if isinstance(self.timeout, (int, float)):
1426 self.sock.settimeout(self.timeout)
1427 self.sock.connect((self.host, self.port))
1428
1429 if isinstance(self, compat_http_client.HTTPSConnection):
1430 if hasattr(self, '_context'): # Python > 2.6
1431 self.sock = self._context.wrap_socket(
1432 self.sock, server_hostname=self.host)
1433 else:
1434 self.sock = ssl.wrap_socket(self.sock)
1435
1436 return SocksConnection
1437
1438
1439 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1440 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1441 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1442 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1443 self._params = params
1444
1445 def https_open(self, req):
1446 kwargs = {}
1447 conn_class = self._https_conn_class
1448
1449 if hasattr(self, '_context'): # python > 2.6
1450 kwargs['context'] = self._context
1451 if hasattr(self, '_check_hostname'): # python 3.x
1452 kwargs['check_hostname'] = self._check_hostname
1453
1454 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1455 if socks_proxy:
1456 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1457 del req.headers['Ytdl-socks-proxy']
1458
1459 try:
1460 return self.do_open(
1461 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1462 except urllib.error.URLError as e:
1463 if (isinstance(e.reason, ssl.SSLError)
1464 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1465 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1466 raise
1467
1468
1469 class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
1470 """
1471 See [1] for cookie file format.
1472
1473 1. https://curl.haxx.se/docs/http-cookies.html
1474 """
1475 _HTTPONLY_PREFIX = '#HttpOnly_'
1476 _ENTRY_LEN = 7
1477 _HEADER = '''# Netscape HTTP Cookie File
1478 # This file is generated by yt-dlp. Do not edit.
1479
1480 '''
1481 _CookieFileEntry = collections.namedtuple(
1482 'CookieFileEntry',
1483 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
1484
1485 def __init__(self, filename=None, *args, **kwargs):
1486 super().__init__(None, *args, **kwargs)
1487 if self.is_path(filename):
1488 filename = os.fspath(filename)
1489 self.filename = filename
1490
1491 @staticmethod
1492 def _true_or_false(cndn):
1493 return 'TRUE' if cndn else 'FALSE'
1494
1495 @staticmethod
1496 def is_path(file):
1497 return isinstance(file, (str, bytes, os.PathLike))
1498
1499 @contextlib.contextmanager
1500 def open(self, file, *, write=False):
1501 if self.is_path(file):
1502 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1503 yield f
1504 else:
1505 if write:
1506 file.truncate(0)
1507 yield file
1508
1509 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1510 now = time.time()
1511 for cookie in self:
1512 if (not ignore_discard and cookie.discard
1513 or not ignore_expires and cookie.is_expired(now)):
1514 continue
1515 name, value = cookie.name, cookie.value
1516 if value is None:
1517 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1518 # with no name, whereas http.cookiejar regards it as a
1519 # cookie with no value.
1520 name, value = '', name
1521 f.write('%s\n' % '\t'.join((
1522 cookie.domain,
1523 self._true_or_false(cookie.domain.startswith('.')),
1524 cookie.path,
1525 self._true_or_false(cookie.secure),
1526 str_or_none(cookie.expires, default=''),
1527 name, value
1528 )))
1529
1530 def save(self, filename=None, *args, **kwargs):
1531 """
1532 Save cookies to a file.
1533 Code is taken from CPython 3.6
1534 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
1535
1536 if filename is None:
1537 if self.filename is not None:
1538 filename = self.filename
1539 else:
1540 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1541
1542 # Store session cookies with `expires` set to 0 instead of an empty string
1543 for cookie in self:
1544 if cookie.expires is None:
1545 cookie.expires = 0
1546
1547 with self.open(filename, write=True) as f:
1548 f.write(self._HEADER)
1549 self._really_save(f, *args, **kwargs)
1550
1551 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
1552 """Load cookies from a file."""
1553 if filename is None:
1554 if self.filename is not None:
1555 filename = self.filename
1556 else:
1557 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1558
1559 def prepare_line(line):
1560 if line.startswith(self._HTTPONLY_PREFIX):
1561 line = line[len(self._HTTPONLY_PREFIX):]
1562 # comments and empty lines are fine
1563 if line.startswith('#') or not line.strip():
1564 return line
1565 cookie_list = line.split('\t')
1566 if len(cookie_list) != self._ENTRY_LEN:
1567 raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
1568 cookie = self._CookieFileEntry(*cookie_list)
1569 if cookie.expires_at and not cookie.expires_at.isdigit():
1570 raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1571 return line
1572
1573 cf = io.StringIO()
1574 with self.open(filename) as f:
1575 for line in f:
1576 try:
1577 cf.write(prepare_line(line))
1578 except compat_cookiejar.LoadError as e:
1579 if f'{line.strip()} '[0] in '[{"':
1580 raise compat_cookiejar.LoadError(
1581 'Cookies file must be Netscape formatted, not JSON. See '
1582 'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
1583 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
1584 continue
1585 cf.seek(0)
1586 self._really_load(cf, filename, ignore_discard, ignore_expires)
1587 # Session cookies are denoted by either the `expires` field set to
1588 # an empty string or 0. MozillaCookieJar only recognizes the former
1589 # (see [1]), so we need to force the latter to be recognized as
1590 # session cookies on our own.
1591 # Session cookies may be important for cookie-based authentication:
1592 # e.g. when a user does not tick the 'Remember me' checkbox while
1593 # logging in on a site, important cookies are often stored as session
1594 # cookies, and failing to recognize them results in a failed login.
1595 # 1. https://bugs.python.org/issue17164
1596 for cookie in self:
1597 # Treat `expires=0` cookies as session cookies
1598 if cookie.expires == 0:
1599 cookie.expires = None
1600 cookie.discard = True
1601
1602
1603 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1604 def __init__(self, cookiejar=None):
1605 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1606
1607 def http_response(self, request, response):
1608 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1609
1610 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1611 https_response = http_response
1612
1613
1614 class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1615 """YoutubeDL redirect handler
1616
1617 The code is based on HTTPRedirectHandler implementation from CPython [1].
1618
1619 This redirect handler solves two issues:
1620 - ensures redirect URL is always unicode under python 2
1621 - introduces support for experimental HTTP response status code
1622 308 Permanent Redirect [2] used by some sites [3]
1623
1624 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1625 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1626 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1627 """
1628
1629 http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
1630
1631 def redirect_request(self, req, fp, code, msg, headers, newurl):
1632 """Return a Request or None in response to a redirect.
1633
1634 This is called by the http_error_30x methods when a
1635 redirection response is received. If a redirection should
1636 take place, return a new Request to allow http_error_30x to
1637 perform the redirect. Otherwise, raise HTTPError if no-one
1638 else should try to handle this url. Return None if you can't
1639 but another Handler might.
1640 """
1641 m = req.get_method()
1642 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1643 or code in (301, 302, 303) and m == "POST")):
1644 raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1645 # Strictly (according to RFC 2616), 301 or 302 in response to
1646 # a POST MUST NOT cause a redirection without confirmation
1647 # from the user (of urllib.request, in this case). In practice,
1648 # essentially all clients do redirect in this case, so we do
1649 # the same.
1650
1651 # Be conciliant with URIs containing a space. This is mainly
1652 # redundant with the more complete encoding done in http_error_302(),
1653 # but it is kept for compatibility with other callers.
1654 newurl = newurl.replace(' ', '%20')
1655
1656 CONTENT_HEADERS = ("content-length", "content-type")
1658 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1659
1660 # A 303 must either use GET or HEAD for subsequent request
1661 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1662 if code == 303 and m != 'HEAD':
1663 m = 'GET'
1664 # 301 and 302 redirects are commonly turned into a GET from a POST
1665 # for subsequent requests by browsers, so we'll do the same.
1666 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1667 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1668 if code in (301, 302) and m == 'POST':
1669 m = 'GET'
1670
1671 return compat_urllib_request.Request(
1672 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1673 unverifiable=True, method=m)
1674
1675
1676 def extract_timezone(date_str):
1677 m = re.search(
1678 r'''(?x)
1679 ^.{8,}? # >=8 char non-TZ prefix, if present
1680 (?P<tz>Z| # just the UTC Z, or
1681 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1682 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1683 [ ]? # optional space
1684 (?P<sign>\+|-) # +/-
1685 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1686 $)
1687 ''', date_str)
1688 if not m:
1689 timezone = datetime.timedelta()
1690 else:
1691 date_str = date_str[:-len(m.group('tz'))]
1692 if not m.group('sign'):
1693 timezone = datetime.timedelta()
1694 else:
1695 sign = 1 if m.group('sign') == '+' else -1
1696 timezone = datetime.timedelta(
1697 hours=sign * int(m.group('hours')),
1698 minutes=sign * int(m.group('minutes')))
1699 return timezone, date_str
1700
1701
1702 def parse_iso8601(date_str, delimiter='T', timezone=None):
1703 """ Return a UNIX timestamp from the given date """
1704
1705 if date_str is None:
1706 return None
1707
1708 date_str = re.sub(r'\.[0-9]+', '', date_str)
1709
1710 if timezone is None:
1711 timezone, date_str = extract_timezone(date_str)
1712
1713 with contextlib.suppress(ValueError):
1714 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1715 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1716 return calendar.timegm(dt.timetuple())
1717
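# Example (editor's illustrative sketch, not part of the upstream file;
# assumes a well-formed ISO 8601 input string):
#   >>> parse_iso8601('2014-01-01T00:00:00+0000')
#   1388534400
#   >>> extract_timezone('2014-01-01T00:00:00+0530')[0]
#   datetime.timedelta(seconds=19800)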
1718
1719 def date_formats(day_first=True):
1720 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1721
1722
1723 def unified_strdate(date_str, day_first=True):
1724 """Return a string with the date in the format YYYYMMDD"""
1725
1726 if date_str is None:
1727 return None
1728 upload_date = None
1729 # Replace commas
1730 date_str = date_str.replace(',', ' ')
1731 # Remove AM/PM + timezone
1732 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1733 _, date_str = extract_timezone(date_str)
1734
1735 for expression in date_formats(day_first):
1736 with contextlib.suppress(ValueError):
1737 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1738 if upload_date is None:
1739 timetuple = email.utils.parsedate_tz(date_str)
1740 if timetuple:
1741 with contextlib.suppress(ValueError):
1742 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1743 if upload_date is not None:
1744 return compat_str(upload_date)
1745
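# Example (editor's illustrative sketch, not part of the upstream file;
# relies on the DATE_FORMATS tables defined earlier in this module):
#   >>> unified_strdate('December 21, 2010')
#   '20101221'
#   >>> unified_strdate('1968 12 10')
#   '19681210'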
1746
1747 def unified_timestamp(date_str, day_first=True):
1748 if date_str is None:
1749 return None
1750
1751 date_str = re.sub(r'[,|]', '', date_str)
1752
1753 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1754 timezone, date_str = extract_timezone(date_str)
1755
1756 # Remove AM/PM + timezone
1757 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1758
1759 # Remove unrecognized timezones from ISO 8601 alike timestamps
1760 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1761 if m:
1762 date_str = date_str[:-len(m.group('tz'))]
1763
1764 # Python only supports microseconds, so remove nanoseconds
1765 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1766 if m:
1767 date_str = m.group(1)
1768
1769 for expression in date_formats(day_first):
1770 with contextlib.suppress(ValueError):
1771 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1772 return calendar.timegm(dt.timetuple())
1773 timetuple = email.utils.parsedate_tz(date_str)
1774 if timetuple:
1775 return calendar.timegm(timetuple) + pm_delta * 3600
1776
1777
1778 def determine_ext(url, default_ext='unknown_video'):
1779 if url is None or '.' not in url:
1780 return default_ext
1781 guess = url.partition('?')[0].rpartition('.')[2]
1782 if re.match(r'^[A-Za-z0-9]+$', guess):
1783 return guess
1784 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1785 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1786 return guess.rstrip('/')
1787 else:
1788 return default_ext
1789
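# Example (editor's illustrative sketch, not part of the upstream file):
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'
#   >>> determine_ext('http://example.com/page')
#   'unknown_video'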
1790
1791 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1792 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1793
1794
1795 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1796 R"""
1797 Return a datetime object from a string.
1798 Supported format:
1799 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1800
1801 @param format strftime format of DATE
1802 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1803 auto: round to the unit provided in date_str (if applicable).
1804 """
1805 auto_precision = False
1806 if precision == 'auto':
1807 auto_precision = True
1808 precision = 'microsecond'
1809 today = datetime_round(datetime.datetime.utcnow(), precision)
1810 if date_str in ('now', 'today'):
1811 return today
1812 if date_str == 'yesterday':
1813 return today - datetime.timedelta(days=1)
1814 match = re.match(
1815 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1816 date_str)
1817 if match is not None:
1818 start_time = datetime_from_str(match.group('start'), precision, format)
1819 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1820 unit = match.group('unit')
1821 if unit == 'month' or unit == 'year':
1822 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1823 unit = 'day'
1824 else:
1825 if unit == 'week':
1826 unit = 'day'
1827 time *= 7
1828 delta = datetime.timedelta(**{unit + 's': time})
1829 new_date = start_time + delta
1830 if auto_precision:
1831 return datetime_round(new_date, unit)
1832 return new_date
1833
1834 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1835
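# Example (editor's illustrative sketch, not part of the upstream file;
# uses an absolute DATE so the result is deterministic):
#   >>> datetime_from_str('20220101-2weeks', precision='day')
#   datetime.datetime(2021, 12, 18, 0, 0)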
1836
1837 def date_from_str(date_str, format='%Y%m%d', strict=False):
1838 R"""
1839 Return a date object from a string using datetime_from_str
1840
1841 @param strict Restrict allowed patterns to "YYYYMMDD" and
1842 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1843 """
1844 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1845 raise ValueError(f'Invalid date format "{date_str}"')
1846 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1847
1848
1849 def datetime_add_months(dt, months):
1850 """Increment/Decrement a datetime object by months."""
1851 month = dt.month + months - 1
1852 year = dt.year + month // 12
1853 month = month % 12 + 1
1854 day = min(dt.day, calendar.monthrange(year, month)[1])
1855 return dt.replace(year, month, day)
1856
1857
1858 def datetime_round(dt, precision='day'):
1859 """
1860 Round a datetime object's time to a specific precision
1861 """
1862 if precision == 'microsecond':
1863 return dt
1864
1865 unit_seconds = {
1866 'day': 86400,
1867 'hour': 3600,
1868 'minute': 60,
1869 'second': 1,
1870 }
1871 roundto = lambda x, n: ((x + n / 2) // n) * n
1872 timestamp = calendar.timegm(dt.timetuple())
1873 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1874
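# Examples (editor's illustrative sketch, not part of the upstream file):
#   >>> datetime_add_months(datetime.datetime(2022, 1, 31), 1)
#   datetime.datetime(2022, 2, 28, 0, 0)
#   >>> datetime_round(datetime.datetime(2022, 1, 1, 13, 0), 'day')
#   datetime.datetime(2022, 1, 2, 0, 0)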
1875
1876 def hyphenate_date(date_str):
1877 """
1878 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1879 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1880 if match is not None:
1881 return '-'.join(match.groups())
1882 else:
1883 return date_str
1884
1885
1886 class DateRange:
1887 """Represents a time interval between two dates"""
1888
1889 def __init__(self, start=None, end=None):
1890 """start and end must be strings in the format accepted by date"""
1891 if start is not None:
1892 self.start = date_from_str(start, strict=True)
1893 else:
1894 self.start = datetime.datetime.min.date()
1895 if end is not None:
1896 self.end = date_from_str(end, strict=True)
1897 else:
1898 self.end = datetime.datetime.max.date()
1899 if self.start > self.end:
1900 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
1901
1902 @classmethod
1903 def day(cls, day):
1904 """Returns a range that only contains the given day"""
1905 return cls(day, day)
1906
1907 def __contains__(self, date):
1908 """Check if the date is in the range"""
1909 if not isinstance(date, datetime.date):
1910 date = date_from_str(date)
1911 return self.start <= date <= self.end
1912
1913 def __str__(self):
1914 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1915
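# Example (editor's illustrative sketch, not part of the upstream file):
#   >>> '20220115' in DateRange('20220101', '20220131')
#   True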
1916
1917 def platform_name():
1918 """ Returns the platform name as a compat_str """
1919 res = platform.platform()
1920 if isinstance(res, bytes):
1921 res = res.decode(preferredencoding())
1922
1923 assert isinstance(res, compat_str)
1924 return res
1925
1926
1927 @functools.cache
1928 def get_windows_version():
1929 ''' Get Windows version. Returns () if not running on Windows '''
1930 if compat_os_name == 'nt':
1931 return version_tuple(platform.win32_ver()[1])
1932 else:
1933 return ()
1934
1935
1936 def write_string(s, out=None, encoding=None):
1937 assert isinstance(s, str)
1938 out = out or sys.stderr
1939
1940 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1941 s = re.sub(r'([\r\n]+)', r' \1', s)
1942
1943 enc, buffer = None, out
1944 if 'b' in getattr(out, 'mode', ''):
1945 enc = encoding or preferredencoding()
1946 elif hasattr(out, 'buffer'):
1947 buffer = out.buffer
1948 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1949
1950 buffer.write(s.encode(enc, 'ignore') if enc else s)
1951 out.flush()
1952
1953
1954 def bytes_to_intlist(bs):
1955 if not bs:
1956 return []
1957 if isinstance(bs[0], int): # Python 3
1958 return list(bs)
1959 else:
1960 return [ord(c) for c in bs]
1961
1962
1963 def intlist_to_bytes(xs):
1964 if not xs:
1965 return b''
1966 return compat_struct_pack('%dB' % len(xs), *xs)
1967
1968
1969 class LockingUnsupportedError(OSError):
1970 msg = 'File locking is not supported'
1971
1972 def __init__(self):
1973 super().__init__(self.msg)
1974
1975
1976 # Cross-platform file locking
1977 if sys.platform == 'win32':
1978 import ctypes.wintypes
1979 import msvcrt
1980
1981 class OVERLAPPED(ctypes.Structure):
1982 _fields_ = [
1983 ('Internal', ctypes.wintypes.LPVOID),
1984 ('InternalHigh', ctypes.wintypes.LPVOID),
1985 ('Offset', ctypes.wintypes.DWORD),
1986 ('OffsetHigh', ctypes.wintypes.DWORD),
1987 ('hEvent', ctypes.wintypes.HANDLE),
1988 ]
1989
1990 kernel32 = ctypes.windll.kernel32
1991 LockFileEx = kernel32.LockFileEx
1992 LockFileEx.argtypes = [
1993 ctypes.wintypes.HANDLE, # hFile
1994 ctypes.wintypes.DWORD, # dwFlags
1995 ctypes.wintypes.DWORD, # dwReserved
1996 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1997 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1998 ctypes.POINTER(OVERLAPPED) # Overlapped
1999 ]
2000 LockFileEx.restype = ctypes.wintypes.BOOL
2001 UnlockFileEx = kernel32.UnlockFileEx
2002 UnlockFileEx.argtypes = [
2003 ctypes.wintypes.HANDLE, # hFile
2004 ctypes.wintypes.DWORD, # dwReserved
2005 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2006 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2007 ctypes.POINTER(OVERLAPPED) # Overlapped
2008 ]
2009 UnlockFileEx.restype = ctypes.wintypes.BOOL
2010 whole_low = 0xffffffff
2011 whole_high = 0x7fffffff
2012
2013 def _lock_file(f, exclusive, block):
2014 overlapped = OVERLAPPED()
2015 overlapped.Offset = 0
2016 overlapped.OffsetHigh = 0
2017 overlapped.hEvent = 0
2018 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2019
2020 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2021 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2022 0, whole_low, whole_high, f._lock_file_overlapped_p):
2023 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2024 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2025
2026 def _unlock_file(f):
2027 assert f._lock_file_overlapped_p
2028 handle = msvcrt.get_osfhandle(f.fileno())
2029 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2030 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2031
2032 else:
2033 try:
2034 import fcntl
2035
2036 def _lock_file(f, exclusive, block):
2037 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2038 if not block:
2039 flags |= fcntl.LOCK_NB
2040 try:
2041 fcntl.flock(f, flags)
2042 except BlockingIOError:
2043 raise
2044 except OSError: # AOSP does not have flock()
2045 fcntl.lockf(f, flags)
2046
2047 def _unlock_file(f):
2048 try:
2049 fcntl.flock(f, fcntl.LOCK_UN)
2050 except OSError:
2051 fcntl.lockf(f, fcntl.LOCK_UN)
2052
2053 except ImportError:
2054
2055 def _lock_file(f, exclusive, block):
2056 raise LockingUnsupportedError()
2057
2058 def _unlock_file(f):
2059 raise LockingUnsupportedError()
2060
2061
2062 class locked_file:
2063 locked = False
2064
2065 def __init__(self, filename, mode, block=True, encoding=None):
2066 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2067 raise NotImplementedError(mode)
2068 self.mode, self.block = mode, block
2069
2070 writable = any(f in mode for f in 'wax+')
2071 readable = any(f in mode for f in 'r+')
2072 flags = functools.reduce(operator.ior, (
2073 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2074 getattr(os, 'O_BINARY', 0), # Windows only
2075 getattr(os, 'O_NOINHERIT', 0), # Windows only
2076 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2077 os.O_APPEND if 'a' in mode else 0,
2078 os.O_EXCL if 'x' in mode else 0,
2079 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2080 ))
2081
2082 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2083
2084 def __enter__(self):
2085 exclusive = 'r' not in self.mode
2086 try:
2087 _lock_file(self.f, exclusive, self.block)
2088 self.locked = True
2089 except OSError:
2090 self.f.close()
2091 raise
2092 if 'w' in self.mode:
2093 try:
2094 self.f.truncate()
2095 except OSError as e:
2096 if e.errno not in (
2097 errno.ESPIPE, # Illegal seek - expected for FIFO
2098 errno.EINVAL, # Invalid argument - expected for /dev/null
2099 ):
2100 raise
2101 return self
2102
2103 def unlock(self):
2104 if not self.locked:
2105 return
2106 try:
2107 _unlock_file(self.f)
2108 finally:
2109 self.locked = False
2110
2111 def __exit__(self, *_):
2112 try:
2113 self.unlock()
2114 finally:
2115 self.f.close()
2116
2117 open = __enter__
2118 close = __exit__
2119
2120 def __getattr__(self, attr):
2121 return getattr(self.f, attr)
2122
2123 def __iter__(self):
2124 return iter(self.f)
2125
2126
2127 @functools.cache
2128 def get_filesystem_encoding():
2129 encoding = sys.getfilesystemencoding()
2130 return encoding if encoding is not None else 'utf-8'
2131
2132
2133 def shell_quote(args):
2134 quoted_args = []
2135 encoding = get_filesystem_encoding()
2136 for a in args:
2137 if isinstance(a, bytes):
2138 # We may get a filename encoded with 'encodeFilename'
2139 a = a.decode(encoding)
2140 quoted_args.append(compat_shlex_quote(a))
2141 return ' '.join(quoted_args)
2142
2143
2144 def smuggle_url(url, data):
2145 """ Pass additional data in a URL for internal use. """
2146
2147 url, idata = unsmuggle_url(url, {})
2148 data.update(idata)
2149 sdata = compat_urllib_parse_urlencode(
2150 {'__youtubedl_smuggle': json.dumps(data)})
2151 return url + '#' + sdata
2152
2153
2154 def unsmuggle_url(smug_url, default=None):
2155 if '#__youtubedl_smuggle' not in smug_url:
2156 return smug_url, default
2157 url, _, sdata = smug_url.rpartition('#')
2158 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2159 data = json.loads(jsond)
2160 return url, data
2161
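# Example round-trip (editor's illustrative sketch, not part of the
# upstream file; the URL and data are made-up values):
#   >>> unsmuggle_url(smuggle_url('https://example.com/v', {'referer': 'x'}))
#   ('https://example.com/v', {'referer': 'x'})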
2162
2163 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2164 """ Formats numbers with decimal sufixes like K, M, etc """
2165 num, factor = float_or_none(num), float(factor)
2166 if num is None or num < 0:
2167 return None
2168 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2169 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2170 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2171 if factor == 1024:
2172 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2173 converted = num / (factor ** exponent)
2174 return fmt % (converted, suffix)
2175
2176
2177 def format_bytes(bytes):
2178 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2179
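# Examples (editor's illustrative sketch, not part of the upstream file;
# factor=1024 switches to the binary "Ki"/"Mi" suffixes):
#   >>> format_decimal_suffix(1234, '%.1f%s')
#   '1.2k'
#   >>> format_bytes(1500)
#   '1.46KiB'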
2180
2181 def lookup_unit_table(unit_table, s):
2182 units_re = '|'.join(re.escape(u) for u in unit_table)
2183 m = re.match(
2184 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2185 if not m:
2186 return None
2187 num_str = m.group('num').replace(',', '.')
2188 mult = unit_table[m.group('unit')]
2189 return int(float(num_str) * mult)
2190
2191
2192 def parse_filesize(s):
2193 if s is None:
2194 return None
2195
2196 # The lower-case forms are of course incorrect and unofficial,
2197 # but we support those too
2198 _UNIT_TABLE = {
2199 'B': 1,
2200 'b': 1,
2201 'bytes': 1,
2202 'KiB': 1024,
2203 'KB': 1000,
2204 'kB': 1024,
2205 'Kb': 1000,
2206 'kb': 1000,
2207 'kilobytes': 1000,
2208 'kibibytes': 1024,
2209 'MiB': 1024 ** 2,
2210 'MB': 1000 ** 2,
2211 'mB': 1024 ** 2,
2212 'Mb': 1000 ** 2,
2213 'mb': 1000 ** 2,
2214 'megabytes': 1000 ** 2,
2215 'mebibytes': 1024 ** 2,
2216 'GiB': 1024 ** 3,
2217 'GB': 1000 ** 3,
2218 'gB': 1024 ** 3,
2219 'Gb': 1000 ** 3,
2220 'gb': 1000 ** 3,
2221 'gigabytes': 1000 ** 3,
2222 'gibibytes': 1024 ** 3,
2223 'TiB': 1024 ** 4,
2224 'TB': 1000 ** 4,
2225 'tB': 1024 ** 4,
2226 'Tb': 1000 ** 4,
2227 'tb': 1000 ** 4,
2228 'terabytes': 1000 ** 4,
2229 'tebibytes': 1024 ** 4,
2230 'PiB': 1024 ** 5,
2231 'PB': 1000 ** 5,
2232 'pB': 1024 ** 5,
2233 'Pb': 1000 ** 5,
2234 'pb': 1000 ** 5,
2235 'petabytes': 1000 ** 5,
2236 'pebibytes': 1024 ** 5,
2237 'EiB': 1024 ** 6,
2238 'EB': 1000 ** 6,
2239 'eB': 1024 ** 6,
2240 'Eb': 1000 ** 6,
2241 'eb': 1000 ** 6,
2242 'exabytes': 1000 ** 6,
2243 'exbibytes': 1024 ** 6,
2244 'ZiB': 1024 ** 7,
2245 'ZB': 1000 ** 7,
2246 'zB': 1024 ** 7,
2247 'Zb': 1000 ** 7,
2248 'zb': 1000 ** 7,
2249 'zettabytes': 1000 ** 7,
2250 'zebibytes': 1024 ** 7,
2251 'YiB': 1024 ** 8,
2252 'YB': 1000 ** 8,
2253 'yB': 1024 ** 8,
2254 'Yb': 1000 ** 8,
2255 'yb': 1000 ** 8,
2256 'yottabytes': 1000 ** 8,
2257 'yobibytes': 1024 ** 8,
2258 }
2259
2260 return lookup_unit_table(_UNIT_TABLE, s)
2261
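# Example (editor's illustrative sketch, not part of the upstream file;
# note the decimal GB vs. binary GiB distinction in the table above):
#   >>> parse_filesize('1.5GB')
#   1500000000
#   >>> parse_filesize('1.5GiB')
#   1610612736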
2262
2263 def parse_count(s):
2264 if s is None:
2265 return None
2266
2267 s = re.sub(r'^[^\d]+\s', '', s).strip()
2268
2269 if re.match(r'^[\d,.]+$', s):
2270 return str_to_int(s)
2271
2272 _UNIT_TABLE = {
2273 'k': 1000,
2274 'K': 1000,
2275 'm': 1000 ** 2,
2276 'M': 1000 ** 2,
2277 'kk': 1000 ** 2,
2278 'KK': 1000 ** 2,
2279 'b': 1000 ** 3,
2280 'B': 1000 ** 3,
2281 }
2282
2283 ret = lookup_unit_table(_UNIT_TABLE, s)
2284 if ret is not None:
2285 return ret
2286
2287 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2288 if mobj:
2289 return str_to_int(mobj.group(1))
2290
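# Example (editor's illustrative sketch, not part of the upstream file):
#   >>> parse_count('1,000 views')
#   1000
#   >>> parse_count('1.5M')
#   1500000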
2291
2292 def parse_resolution(s, *, lenient=False):
2293 if s is None:
2294 return {}
2295
2296 if lenient:
2297 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2298 else:
2299 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2300 if mobj:
2301 return {
2302 'width': int(mobj.group('w')),
2303 'height': int(mobj.group('h')),
2304 }
2305
2306 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2307 if mobj:
2308 return {'height': int(mobj.group(1))}
2309
2310 mobj = re.search(r'\b([48])[kK]\b', s)
2311 if mobj:
2312 return {'height': int(mobj.group(1)) * 540}
2313
2314 return {}
2315
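# Examples (editor's illustrative sketch, not part of the upstream file):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('4K')
#   {'height': 2160}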
2316
2317 def parse_bitrate(s):
2318 if not isinstance(s, compat_str):
2319 return
2320 mobj = re.search(r'\b(\d+)\s*kbps', s)
2321 if mobj:
2322 return int(mobj.group(1))
2323
2324
2325 def month_by_name(name, lang='en'):
2326 """ Return the number of a month by (locale-independently) English name """
2327
2328 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2329
2330 try:
2331 return month_names.index(name) + 1
2332 except ValueError:
2333 return None
2334
2335
2336 def month_by_abbreviation(abbrev):
2337 """ Return the number of a month by (locale-independently) English
2338 abbreviations """
2339
2340 try:
2341 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2342 except ValueError:
2343 return None
2344
2345
2346 def fix_xml_ampersands(xml_str):
2347 """Replace all the '&' by '&amp;' in XML"""
2348 return re.sub(
2349 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2350 '&amp;',
2351 xml_str)
2352
2353
2354 def setproctitle(title):
2355 assert isinstance(title, compat_str)
2356
2357 # ctypes in Jython is not complete
2358 # http://bugs.jython.org/issue2148
2359 if sys.platform.startswith('java'):
2360 return
2361
2362 try:
2363 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2364 except OSError:
2365 return
2366 except TypeError:
2367 # LoadLibrary in Windows Python 2.7.13 only expects
2368 # a bytestring, but since unicode_literals turns
2369 # every string into a unicode string, it fails.
2370 return
2371 title_bytes = title.encode()
2372 buf = ctypes.create_string_buffer(len(title_bytes))
2373 buf.value = title_bytes
2374 try:
2375 libc.prctl(15, buf, 0, 0, 0)
2376 except AttributeError:
2377 return # Strange libc, just skip this
2378
2379
2380 def remove_start(s, start):
2381 return s[len(start):] if s is not None and s.startswith(start) else s
2382
2383
2384 def remove_end(s, end):
2385 return s[:-len(end)] if s is not None and s.endswith(end) else s
2386
2387
2388 def remove_quotes(s):
2389 if s is None or len(s) < 2:
2390 return s
2391 for quote in ('"', "'", ):
2392 if s[0] == quote and s[-1] == quote:
2393 return s[1:-1]
2394 return s
2395
2396
2397 def get_domain(url):
2398 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2399 return domain.group('domain') if domain else None
2400
2401
2402 def url_basename(url):
2403 path = compat_urlparse.urlparse(url).path
2404 return path.strip('/').split('/')[-1]
2405
2406
2407 def base_url(url):
2408 return re.match(r'https?://[^?#&]+/', url).group()
2409
2410
2411 def urljoin(base, path):
2412 if isinstance(path, bytes):
2413 path = path.decode()
2414 if not isinstance(path, compat_str) or not path:
2415 return None
2416 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2417 return path
2418 if isinstance(base, bytes):
2419 base = base.decode()
2420 if not isinstance(base, compat_str) or not re.match(
2421 r'^(?:https?:)?//', base):
2422 return None
2423 return compat_urlparse.urljoin(base, path)
2424
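# Examples (editor's illustrative sketch, not part of the upstream file;
# non-http(s) bases are rejected by design):
#   >>> urljoin('https://example.com/a/', 'b.mp4')
#   'https://example.com/a/b.mp4'
#   >>> urljoin('ftp://example.com/', 'b.mp4') is None
#   True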
2425
2426 class HEADRequest(compat_urllib_request.Request):
2427 def get_method(self):
2428 return 'HEAD'
2429
2430
2431 class PUTRequest(compat_urllib_request.Request):
2432 def get_method(self):
2433 return 'PUT'
2434
2435
2436 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2437 if get_attr and v is not None:
2438 v = getattr(v, get_attr, None)
2439 try:
2440 return int(v) * invscale // scale
2441 except (ValueError, TypeError, OverflowError):
2442 return default
2443
2444
2445 def str_or_none(v, default=None):
2446 return default if v is None else compat_str(v)
2447
2448
2449 def str_to_int(int_str):
2450 """ A more relaxed version of int_or_none """
2451 if isinstance(int_str, int):
2452 return int_str
2453 elif isinstance(int_str, compat_str):
2454 int_str = re.sub(r'[,\.\+]', '', int_str)
2455 return int_or_none(int_str)
2456
2457
2458 def float_or_none(v, scale=1, invscale=1, default=None):
2459 if v is None:
2460 return default
2461 try:
2462 return float(v) * invscale / scale
2463 except (ValueError, TypeError):
2464 return default
2465
2466
2467 def bool_or_none(v, default=None):
2468 return v if isinstance(v, bool) else default
2469
2470
2471 def strip_or_none(v, default=None):
2472 return v.strip() if isinstance(v, compat_str) else default
2473
2474
2475 def url_or_none(url):
2476 if not url or not isinstance(url, compat_str):
2477 return None
2478 url = url.strip()
2479 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2480
2481
2482 def request_to_url(req):
2483 if isinstance(req, compat_urllib_request.Request):
2484 return req.get_full_url()
2485 else:
2486 return req
2487
2488
2489 def strftime_or_none(timestamp, date_format, default=None):
2490 datetime_object = None
2491 try:
2492 if isinstance(timestamp, (int, float)): # unix timestamp
2493 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2494 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2495 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2496 return datetime_object.strftime(date_format)
2497 except (ValueError, TypeError, AttributeError):
2498 return default
2499
2500
2501 def parse_duration(s):
2502 if not isinstance(s, str):
2503 return None
2504 s = s.strip()
2505 if not s:
2506 return None
2507
2508 days, hours, mins, secs, ms = [None] * 5
2509 m = re.match(r'''(?x)
2510 (?P<before_secs>
2511 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2512 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2513 (?P<ms>[.:][0-9]+)?Z?$
2514 ''', s)
2515 if m:
2516 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2517 else:
2518 m = re.match(
2519 r'''(?ix)(?:P?
2520 (?:
2521 [0-9]+\s*y(?:ears?)?,?\s*
2522 )?
2523 (?:
2524 [0-9]+\s*m(?:onths?)?,?\s*
2525 )?
2526 (?:
2527 [0-9]+\s*w(?:eeks?)?,?\s*
2528 )?
2529 (?:
2530 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2531 )?
2532 T)?
2533 (?:
2534 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2535 )?
2536 (?:
2537 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2538 )?
2539 (?:
2540 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2541 )?Z?$''', s)
2542 if m:
2543 days, hours, mins, secs, ms = m.groups()
2544 else:
2545 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2546 if m:
2547 hours, mins = m.groups()
2548 else:
2549 return None
2550
2551 if ms:
2552 ms = ms.replace(':', '.')
2553 return sum(float(part or 0) * mult for part, mult in (
2554 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2555
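# Examples (editor's illustrative sketch, not part of the upstream file;
# both colon-separated and ISO 8601-style durations are accepted):
#   >>> parse_duration('1:30')
#   90.0
#   >>> parse_duration('PT1H30M')
#   5400.0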
2556
2557 def prepend_extension(filename, ext, expected_real_ext=None):
2558 name, real_ext = os.path.splitext(filename)
2559 return (
2560 f'{name}.{ext}{real_ext}'
2561 if not expected_real_ext or real_ext[1:] == expected_real_ext
2562 else f'{filename}.{ext}')
2563
2564
2565 def replace_extension(filename, ext, expected_real_ext=None):
2566 name, real_ext = os.path.splitext(filename)
2567 return '{}.{}'.format(
2568 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2569 ext)
2570
2571
2572 def check_executable(exe, args=[]):
2573 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2574 args can be a list of arguments for a short output (like -version) """
2575 try:
2576 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2577 except OSError:
2578 return False
2579 return exe
2580
2581
2582 def _get_exe_version_output(exe, args, *, to_screen=None):
2583 if to_screen:
2584 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2585 try:
2586 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2587 # SIGTTOU if yt-dlp is run in the background.
2588 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2589 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2590 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2591 except OSError:
2592 return False
2593 return stdout
2594
2595
2596 def detect_exe_version(output, version_re=None, unrecognized='present'):
2597 assert isinstance(output, compat_str)
2598 if version_re is None:
2599 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2600 m = re.search(version_re, output)
2601 if m:
2602 return m.group(1)
2603 else:
2604 return unrecognized
2605
2606
2607 def get_exe_version(exe, args=['--version'],
2608 version_re=None, unrecognized='present'):
2609 """ Returns the version of the specified executable,
2610 or False if the executable is not present """
2611 out = _get_exe_version_output(exe, args)
2612 return detect_exe_version(out, version_re, unrecognized) if out else False
2613
2614
2615 def frange(start=0, stop=None, step=1):
2616 """Float range"""
2617 if stop is None:
2618 start, stop = 0, start
2619 sign = [-1, 1][step > 0] if step else 0
2620 while sign * start < sign * stop:
2621 yield start
2622 start += step
2623
2624
2625 class LazyList(collections.abc.Sequence):
2626 """Lazy immutable list from an iterable
2627 Note that slices of a LazyList are lists and not LazyList"""
2628
2629 class IndexError(IndexError):
2630 pass
2631
2632 def __init__(self, iterable, *, reverse=False, _cache=None):
2633 self._iterable = iter(iterable)
2634 self._cache = [] if _cache is None else _cache
2635 self._reversed = reverse
2636
2637 def __iter__(self):
2638 if self._reversed:
2639 # We need to consume the entire iterable to iterate in reverse
2640 yield from self.exhaust()
2641 return
2642 yield from self._cache
2643 for item in self._iterable:
2644 self._cache.append(item)
2645 yield item
2646
2647 def _exhaust(self):
2648 self._cache.extend(self._iterable)
2649 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2650 return self._cache
2651
2652 def exhaust(self):
2653 """Evaluate the entire iterable"""
2654 return self._exhaust()[::-1 if self._reversed else 1]
2655
2656 @staticmethod
2657 def _reverse_index(x):
2658 return None if x is None else -(x + 1)
2659
2660 def __getitem__(self, idx):
2661 if isinstance(idx, slice):
2662 if self._reversed:
2663 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2664 start, stop, step = idx.start, idx.stop, idx.step or 1
2665 elif isinstance(idx, int):
2666 if self._reversed:
2667 idx = self._reverse_index(idx)
2668 start, stop, step = idx, idx, 0
2669 else:
2670 raise TypeError('indices must be integers or slices')
2671 if ((start or 0) < 0 or (stop or 0) < 0
2672 or (start is None and step < 0)
2673 or (stop is None and step > 0)):
2674 # We need to consume the entire iterable to be able to slice from the end
2675 # Obviously, never use this with infinite iterables
2676 self._exhaust()
2677 try:
2678 return self._cache[idx]
2679 except IndexError as e:
2680 raise self.IndexError(e) from e
2681 n = max(start or 0, stop or 0) - len(self._cache) + 1
2682 if n > 0:
2683 self._cache.extend(itertools.islice(self._iterable, n))
2684 try:
2685 return self._cache[idx]
2686 except IndexError as e:
2687 raise self.IndexError(e) from e
2688
2689 def __bool__(self):
2690 try:
2691 self[-1] if self._reversed else self[0]
2692 except self.IndexError:
2693 return False
2694 return True
2695
2696 def __len__(self):
2697 self._exhaust()
2698 return len(self._cache)
2699
2700 def __reversed__(self):
2701 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2702
2703 def __copy__(self):
2704 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2705
2706 def __repr__(self):
2707 # repr and str should mimic a list. So we exhaust the iterable
2708 return repr(self.exhaust())
2709
2710 def __str__(self):
2711 return repr(self.exhaust())
2712
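# Example (editor's illustrative sketch, not part of the upstream file;
# slicing only consumes as much of the iterable as needed):
#   >>> lst = LazyList(itertools.count())
#   >>> lst[:3]
#   [0, 1, 2]
#   >>> lst[10]
#   10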
2713
2714 class PagedList:
2715
2716 class IndexError(IndexError):
2717 pass
2718
2719 def __len__(self):
2720 # This is only useful for tests
2721 return len(self.getslice())
2722
2723 def __init__(self, pagefunc, pagesize, use_cache=True):
2724 self._pagefunc = pagefunc
2725 self._pagesize = pagesize
2726 self._pagecount = float('inf')
2727 self._use_cache = use_cache
2728 self._cache = {}
2729
2730 def getpage(self, pagenum):
2731 page_results = self._cache.get(pagenum)
2732 if page_results is None:
2733 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2734 if self._use_cache:
2735 self._cache[pagenum] = page_results
2736 return page_results
2737
2738 def getslice(self, start=0, end=None):
2739 return list(self._getslice(start, end))
2740
2741 def _getslice(self, start, end):
2742 raise NotImplementedError('This method must be implemented by subclasses')
2743
2744 def __getitem__(self, idx):
2745 assert self._use_cache, 'Indexing PagedList requires cache'
2746 if not isinstance(idx, int) or idx < 0:
2747 raise TypeError('indices must be non-negative integers')
2748 entries = self.getslice(idx, idx + 1)
2749 if not entries:
2750 raise self.IndexError()
2751 return entries[0]
2752
2753
2754 class OnDemandPagedList(PagedList):
2755 """Download pages until a page with less than maximum results"""
2756
2757 def _getslice(self, start, end):
2758 for pagenum in itertools.count(start // self._pagesize):
2759 firstid = pagenum * self._pagesize
2760 nextfirstid = pagenum * self._pagesize + self._pagesize
2761 if start >= nextfirstid:
2762 continue
2763
2764 startv = (
2765 start % self._pagesize
2766 if firstid <= start < nextfirstid
2767 else 0)
2768 endv = (
2769 ((end - 1) % self._pagesize) + 1
2770 if (end is not None and firstid <= end <= nextfirstid)
2771 else None)
2772
2773 try:
2774 page_results = self.getpage(pagenum)
2775 except Exception:
2776 self._pagecount = pagenum - 1
2777 raise
2778 if startv != 0 or endv is not None:
2779 page_results = page_results[startv:endv]
2780 yield from page_results
2781
2782 # A little optimization - if the current page is not "full", i.e. does
2783 # not contain page_size videos, then we can assume that this page
2784 # is the last one - there are no more ids on further pages,
2785 # so there is no need to query again.
2786 if len(page_results) + startv < self._pagesize:
2787 break
2788
2789 # If we got the whole page, but the next page is not interesting,
2790 # break out early as well
2791 if end == nextfirstid:
2792 break
2793
2794
2795 class InAdvancePagedList(PagedList):
2796 """PagedList with total number of pages known in advance"""
2797
2798 def __init__(self, pagefunc, pagecount, pagesize):
2799 PagedList.__init__(self, pagefunc, pagesize, True)
2800 self._pagecount = pagecount
2801
2802 def _getslice(self, start, end):
2803 start_page = start // self._pagesize
2804 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2805 skip_elems = start - start_page * self._pagesize
2806 only_more = None if end is None else end - start
2807 for pagenum in range(start_page, end_page):
2808 page_results = self.getpage(pagenum)
2809 if skip_elems:
2810 page_results = page_results[skip_elems:]
2811 skip_elems = None
2812 if only_more is not None:
2813 if len(page_results) < only_more:
2814 only_more -= len(page_results)
2815 else:
2816 yield from page_results[:only_more]
2817 break
2818 yield from page_results
2819
2820
2821 class PlaylistEntries:
2822 MissingEntry = object()
2823 is_exhausted = False
2824
2825 def __init__(self, ydl, info_dict):
2826 self.ydl = ydl
2827
2828 # _entries must be assigned now since infodict can change during iteration
2829 entries = info_dict.get('entries')
2830 if entries is None:
2831 raise EntryNotInPlaylist('There are no entries')
2832 elif isinstance(entries, list):
2833 self.is_exhausted = True
2834
2835 requested_entries = info_dict.get('requested_entries')
2836 self.is_incomplete = bool(requested_entries)
2837 if self.is_incomplete:
2838 assert self.is_exhausted
2839 self._entries = [self.MissingEntry] * max(requested_entries)
2840 for i, entry in zip(requested_entries, entries):
2841 self._entries[i - 1] = entry
2842 elif isinstance(entries, (list, PagedList, LazyList)):
2843 self._entries = entries
2844 else:
2845 self._entries = LazyList(entries)
2846
2847 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2848 (?P<start>[+-]?\d+)?
2849 (?P<range>[:-]
2850 (?P<end>[+-]?\d+|inf(?:inite)?)?
2851 (?::(?P<step>[+-]?\d+))?
2852 )?''')
2853
2854 @classmethod
2855 def parse_playlist_items(cls, string):
2856 for segment in string.split(','):
2857 if not segment:
2858 raise ValueError('There are two or more consecutive commas')
2859 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2860 if not mobj:
2861 raise ValueError(f'{segment!r} is not a valid specification')
2862 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2863 if int_or_none(step) == 0:
2864 raise ValueError(f'Step in {segment!r} cannot be zero')
2865 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2866
2867 def get_requested_items(self):
2868 playlist_items = self.ydl.params.get('playlist_items')
2869 playlist_start = self.ydl.params.get('playliststart', 1)
2870 playlist_end = self.ydl.params.get('playlistend')
2871 # For backwards compatibility, interpret -1 as whole list
2872 if playlist_end in (-1, None):
2873 playlist_end = ''
2874 if not playlist_items:
2875 playlist_items = f'{playlist_start}:{playlist_end}'
2876 elif playlist_start != 1 or playlist_end:
2877 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2878
2879 for index in self.parse_playlist_items(playlist_items):
2880 for i, entry in self[index]:
2881 yield i, entry
2882 try:
2883 # TODO: Add auto-generated fields
2884 self.ydl._match_entry(entry, incomplete=True, silent=True)
2885 except (ExistingVideoReached, RejectedVideoReached):
2886 return
2887
2888 def get_full_count(self):
2889 if self.is_exhausted and not self.is_incomplete:
2890 return len(self)
2891 elif isinstance(self._entries, InAdvancePagedList):
2892 if self._entries._pagesize == 1:
2893 return self._entries._pagecount
2894
2895 @functools.cached_property
2896 def _getter(self):
2897 if isinstance(self._entries, list):
2898 def get_entry(i):
2899 try:
2900 entry = self._entries[i]
2901 except IndexError:
2902 entry = self.MissingEntry
2903 if not self.is_incomplete:
2904 raise self.IndexError()
2905 if entry is self.MissingEntry:
2906 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2907 return entry
2908 else:
2909 def get_entry(i):
2910 try:
2911 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2912 except (LazyList.IndexError, PagedList.IndexError):
2913 raise self.IndexError()
2914 return get_entry
2915
2916 def __getitem__(self, idx):
2917 if isinstance(idx, int):
2918 idx = slice(idx, idx)
2919
2920 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2921 step = 1 if idx.step is None else idx.step
2922 if idx.start is None:
2923 start = 0 if step > 0 else len(self) - 1
2924 else:
2925 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2926
2927 # NB: Do not call len(self) when idx == [:]
2928 if idx.stop is None:
2929 stop = 0 if step < 0 else float('inf')
2930 else:
2931 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2932 stop += [-1, 1][step > 0]
2933
2934 for i in frange(start, stop, step):
2935 if i < 0:
2936 continue
2937 try:
2938 entry = self._getter(i)
2939 except self.IndexError:
2940 self.is_exhausted = True
2941 if step > 0:
2942 break
2943 continue
2944 yield i + 1, entry
2945
2946 def __len__(self):
2947 return len(tuple(self[:]))
2948
2949 class IndexError(IndexError):
2950 pass
2951
2952
2953 def uppercase_escape(s):
2954 unicode_escape = codecs.getdecoder('unicode_escape')
2955 return re.sub(
2956 r'\\U[0-9a-fA-F]{8}',
2957 lambda m: unicode_escape(m.group(0))[0],
2958 s)
2959
2960
2961 def lowercase_escape(s):
2962 unicode_escape = codecs.getdecoder('unicode_escape')
2963 return re.sub(
2964 r'\\u[0-9a-fA-F]{4}',
2965 lambda m: unicode_escape(m.group(0))[0],
2966 s)
2967
2968
2969 def escape_rfc3986(s):
2970 """Escape non-ASCII characters as suggested by RFC 3986"""
2971 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2972
2973
2974 def escape_url(url):
2975 """Escape URL as suggested by RFC 3986"""
2976 url_parsed = compat_urllib_parse_urlparse(url)
2977 return url_parsed._replace(
2978 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2979 path=escape_rfc3986(url_parsed.path),
2980 params=escape_rfc3986(url_parsed.params),
2981 query=escape_rfc3986(url_parsed.query),
2982 fragment=escape_rfc3986(url_parsed.fragment)
2983 ).geturl()
2984
2985
2986 def parse_qs(url):
2987 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2988
2989
2990 def read_batch_urls(batch_fd):
2991 def fixup(url):
2992 if not isinstance(url, compat_str):
2993 url = url.decode('utf-8', 'replace')
2994 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2995 for bom in BOM_UTF8:
2996 if url.startswith(bom):
2997 url = url[len(bom):]
2998 url = url.lstrip()
2999 if not url or url.startswith(('#', ';', ']')):
3000 return False
3001 # "#" cannot be stripped out since it is part of the URI
3002 # However, it can be safely stripped out if it follows a whitespace character
3003 return re.split(r'\s#', url, 1)[0].rstrip()
3004
3005 with contextlib.closing(batch_fd) as fd:
3006 return [url for url in map(fixup, fd) if url]
3007
3008
3009 def urlencode_postdata(*args, **kargs):
3010 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
3011
3012
3013 def update_url_query(url, query):
3014 if not query:
3015 return url
3016 parsed_url = compat_urlparse.urlparse(url)
3017 qs = compat_parse_qs(parsed_url.query)
3018 qs.update(query)
3019 return compat_urlparse.urlunparse(parsed_url._replace(
3020 query=compat_urllib_parse_urlencode(qs, True)))
3021
3022
3023 def update_Request(req, url=None, data=None, headers={}, query={}):
3024 req_headers = req.headers.copy()
3025 req_headers.update(headers)
3026 req_data = data or req.data
3027 req_url = update_url_query(url or req.get_full_url(), query)
3028 req_get_method = req.get_method()
3029 if req_get_method == 'HEAD':
3030 req_type = HEADRequest
3031 elif req_get_method == 'PUT':
3032 req_type = PUTRequest
3033 else:
3034 req_type = compat_urllib_request.Request
3035 new_req = req_type(
3036 req_url, data=req_data, headers=req_headers,
3037 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3038 if hasattr(req, 'timeout'):
3039 new_req.timeout = req.timeout
3040 return new_req
3041
3042
3043 def _multipart_encode_impl(data, boundary):
3044 content_type = 'multipart/form-data; boundary=%s' % boundary
3045
3046 out = b''
3047 for k, v in data.items():
3048 out += b'--' + boundary.encode('ascii') + b'\r\n'
3049 if isinstance(k, compat_str):
3050 k = k.encode()
3051 if isinstance(v, compat_str):
3052 v = v.encode()
3053 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3054 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3055 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3056 if boundary.encode('ascii') in content:
3057 raise ValueError('Boundary overlaps with data')
3058 out += content
3059
3060 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3061
3062 return out, content_type
3063
3064
3065 def multipart_encode(data, boundary=None):
3066 '''
3067 Encode a dict to RFC 7578-compliant form-data
3068
3069 data:
3070 A dict where keys and values can be either Unicode or bytes-like
3071 objects.
3072 boundary:
3073 If specified, it must be a Unicode object and is used as the boundary.
3074 Otherwise, a random boundary is generated.
3075
3076 Reference: https://tools.ietf.org/html/rfc7578
3077 '''
3078 has_specified_boundary = boundary is not None
3079
3080 while True:
3081 if boundary is None:
3082 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3083
3084 try:
3085 out, content_type = _multipart_encode_impl(data, boundary)
3086 break
3087 except ValueError:
3088 if has_specified_boundary:
3089 raise
3090 boundary = None
3091
3092 return out, content_type
3093
3094
3095 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3096 for val in map(d.get, variadic(key_or_keys)):
3097 if val is not None and (val or not skip_false_values):
3098 return val
3099 return default
3100
3101
3102 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3103 for f in funcs:
3104 try:
3105 val = f(*args, **kwargs)
3106 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3107 pass
3108 else:
3109 if expected_type is None or isinstance(val, expected_type):
3110 return val
3111
3112
3113 def try_get(src, getter, expected_type=None):
3114 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3115
3116
3117 def filter_dict(dct, cndn=lambda _, v: v is not None):
3118 return {k: v for k, v in dct.items() if cndn(k, v)}
3119
3120
3121 def merge_dicts(*dicts):
3122 merged = {}
3123 for a_dict in dicts:
3124 for k, v in a_dict.items():
3125 if (v is not None and k not in merged
3126 or isinstance(v, str) and merged[k] == ''):
3127 merged[k] = v
3128 return merged
3129
3130
3131 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3132 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3133
3134
3135 US_RATINGS = {
3136 'G': 0,
3137 'PG': 10,
3138 'PG-13': 13,
3139 'R': 16,
3140 'NC': 18,
3141 }
3142
3143
3144 TV_PARENTAL_GUIDELINES = {
3145 'TV-Y': 0,
3146 'TV-Y7': 7,
3147 'TV-G': 0,
3148 'TV-PG': 0,
3149 'TV-14': 14,
3150 'TV-MA': 17,
3151 }
3152
3153
3154 def parse_age_limit(s):
3155 # isinstance(False, int) is True. So type() must be used instead
3156 if type(s) is int: # noqa: E721
3157 return s if 0 <= s <= 21 else None
3158 elif not isinstance(s, str):
3159 return None
3160 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3161 if m:
3162 return int(m.group('age'))
3163 s = s.upper()
3164 if s in US_RATINGS:
3165 return US_RATINGS[s]
3166 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3167 if m:
3168 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3169 return None
3170
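# Examples (editor's illustrative sketch, not part of the upstream file):
#   >>> parse_age_limit('TV-MA')
#   17
#   >>> parse_age_limit('18+')
#   18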
3171
3172 def strip_jsonp(code):
3173 return re.sub(
3174 r'''(?sx)^
3175 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3176 (?:\s*&&\s*(?P=func_name))?
3177 \s*\(\s*(?P<callback_data>.*)\);?
3178 \s*?(?://[^\n]*)*$''',
3179 r'\g<callback_data>', code)
3180
3181
3182 def js_to_json(code, vars={}):
3183 # vars is a dict of var, val pairs to substitute
3184 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3185 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3186 INTEGER_TABLE = (
3187 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3188 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3189 )
3190
3191 def fix_kv(m):
3192 v = m.group(0)
3193 if v in ('true', 'false', 'null'):
3194 return v
3195 elif v in ('undefined', 'void 0'):
3196 return 'null'
3197 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3198 return ""
3199
3200 if v[0] in ("'", '"'):
3201 v = re.sub(r'(?s)\\.|"', lambda m: {
3202 '"': '\\"',
3203 "\\'": "'",
3204 '\\\n': '',
3205 '\\x': '\\u00',
3206 }.get(m.group(0), m.group(0)), v[1:-1])
3207 else:
3208 for regex, base in INTEGER_TABLE:
3209 im = re.match(regex, v)
3210 if im:
3211 i = int(im.group(1), base)
3212 return '"%d":' % i if v.endswith(':') else '%d' % i
3213
3214 if v in vars:
3215 return vars[v]
3216
3217 return '"%s"' % v
3218
3219 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3220
3221 return re.sub(r'''(?sx)
3222 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3223 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3224 {comment}|,(?={skip}[\]}}])|
3225 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3226 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3227 [0-9]+(?={skip}:)|
3228 !+
3229 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3230
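# Example (editor's illustrative sketch, not part of the upstream file;
# unquoted keys are quoted and `undefined` becomes `null`):
#   >>> js_to_json("{'x': 1, y: true, z: undefined}")
#   '{"x": 1, "y": true, "z": null}'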
3231
3232 def qualities(quality_ids):
3233 """ Get a numeric quality value out of a list of possible values """
3234 def q(qid):
3235 try:
3236 return quality_ids.index(qid)
3237 except ValueError:
3238 return -1
3239 return q
3240
3241
3242 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
3243
3244
3245 DEFAULT_OUTTMPL = {
3246 'default': '%(title)s [%(id)s].%(ext)s',
3247 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3248 }
3249 OUTTMPL_TYPES = {
3250 'chapter': None,
3251 'subtitle': None,
3252 'thumbnail': None,
3253 'description': 'description',
3254 'annotation': 'annotations.xml',
3255 'infojson': 'info.json',
3256 'link': None,
3257 'pl_video': None,
3258 'pl_thumbnail': None,
3259 'pl_description': 'description',
3260 'pl_infojson': 'info.json',
3261 }
3262
3263 # As of [1], the format syntax is:
3264 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3265 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3266 STR_FORMAT_RE_TMPL = r'''(?x)
3267 (?<!%)(?P<prefix>(?:%%)*)
3268 %
3269 (?P<has_key>\((?P<key>{0})\))?
3270 (?P<format>
3271 (?P<conversion>[#0\-+ ]+)?
3272 (?P<min_width>\d+)?
3273 (?P<precision>\.\d+)?
3274 (?P<len_mod>[hlL])? # unused in python
3275 {1} # conversion type
3276 )
3277 '''
3278
3279
3280 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3281
3282
3283 def limit_length(s, length):
3284 """ Add ellipses to overly long strings """
3285 if s is None:
3286 return None
3287 ELLIPSES = '...'
3288 if len(s) > length:
3289 return s[:length - len(ELLIPSES)] + ELLIPSES
3290 return s
3291
3292
3293 def version_tuple(v):
3294 return tuple(int(e) for e in re.split(r'[-.]', v))
3295
3296
3297 def is_outdated_version(version, limit, assume_new=True):
3298 if not version:
3299 return not assume_new
3300 try:
3301 return version_tuple(version) < version_tuple(limit)
3302 except ValueError:
3303 return not assume_new
3304
3305
3306 def ytdl_is_updateable():
3307 """ Returns if yt-dlp can be updated with -U """
3308
3309 from .update import is_non_updateable
3310
3311 return not is_non_updateable()
3312
3313
3314 def args_to_str(args):
3315 # Get a short string representation for a subprocess command
3316 return ' '.join(compat_shlex_quote(a) for a in args)
3317
3318
3319 def error_to_compat_str(err):
3320 return str(err)
3321
3322
3323 def error_to_str(err):
3324 return f'{type(err).__name__}: {err}'
3325
3326
3327 def mimetype2ext(mt):
3328 if mt is None:
3329 return None
3330
3331 mt, _, params = mt.partition(';')
3332 mt = mt.strip()
3333
3334 FULL_MAP = {
3335 'audio/mp4': 'm4a',
3336 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3337 # it's the most popular one
3338 'audio/mpeg': 'mp3',
3339 'audio/x-wav': 'wav',
3340 'audio/wav': 'wav',
3341 'audio/wave': 'wav',
3342 }
3343
3344 ext = FULL_MAP.get(mt)
3345 if ext is not None:
3346 return ext
3347
3348 SUBTYPE_MAP = {
3349 '3gpp': '3gp',
3350 'smptett+xml': 'tt',
3351 'ttaf+xml': 'dfxp',
3352 'ttml+xml': 'ttml',
3353 'x-flv': 'flv',
3354 'x-mp4-fragmented': 'mp4',
3355 'x-ms-sami': 'sami',
3356 'x-ms-wmv': 'wmv',
3357 'mpegurl': 'm3u8',
3358 'x-mpegurl': 'm3u8',
3359 'vnd.apple.mpegurl': 'm3u8',
3360 'dash+xml': 'mpd',
3361 'f4m+xml': 'f4m',
3362 'hds+xml': 'f4m',
3363 'vnd.ms-sstr+xml': 'ism',
3364 'quicktime': 'mov',
3365 'mp2t': 'ts',
3366 'x-wav': 'wav',
3367 'filmstrip+json': 'fs',
3368 'svg+xml': 'svg',
3369 }
3370
3371 _, _, subtype = mt.rpartition('/')
3372 ext = SUBTYPE_MAP.get(subtype.lower())
3373 if ext is not None:
3374 return ext
3375
3376 SUFFIX_MAP = {
3377 'json': 'json',
3378 'xml': 'xml',
3379 'zip': 'zip',
3380 'gzip': 'gz',
3381 }
3382
3383 _, _, suffix = subtype.partition('+')
3384 ext = SUFFIX_MAP.get(suffix)
3385 if ext is not None:
3386 return ext
3387
3388 return subtype.replace('+', '.')
3389
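# Examples (editor's illustrative sketch, not part of the upstream file;
# MIME parameters after ';' are ignored):
#   >>> mimetype2ext('application/x-mpegurl')
#   'm3u8'
#   >>> mimetype2ext('video/mp4; codecs="avc1.42E01E"')
#   'mp4'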
3390
3391 def ext2mimetype(ext_or_url):
3392 if not ext_or_url:
3393 return None
3394 if '.' not in ext_or_url:
3395 ext_or_url = f'file.{ext_or_url}'
3396 return mimetypes.guess_type(ext_or_url)[0]
3397
3398
3399 def parse_codecs(codecs_str):
3400 # http://tools.ietf.org/html/rfc6381
3401 if not codecs_str:
3402 return {}
3403 split_codecs = list(filter(None, map(
3404 str.strip, codecs_str.strip().strip(',').split(','))))
3405 vcodec, acodec, scodec, hdr = None, None, None, None
3406 for full_codec in split_codecs:
3407 parts = full_codec.split('.')
3408 codec = parts[0].replace('0', '')
3409 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3410 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3411 if not vcodec:
3412 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3413 if codec in ('dvh1', 'dvhe'):
3414 hdr = 'DV'
3415 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3416 hdr = 'HDR10'
3417 elif full_codec.replace('0', '').startswith('vp9.2'):
3418 hdr = 'HDR10'
3419 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3420 if not acodec:
3421 acodec = full_codec
3422 elif codec in ('stpp', 'wvtt',):
3423 if not scodec:
3424 scodec = full_codec
3425 else:
3426 write_string(f'WARNING: Unknown codec {full_codec}\n')
3427 if vcodec or acodec or scodec:
3428 return {
3429 'vcodec': vcodec or 'none',
3430 'acodec': acodec or 'none',
3431 'dynamic_range': hdr,
3432 **({'scodec': scodec} if scodec is not None else {}),
3433 }
3434 elif len(split_codecs) == 2:
3435 return {
3436 'vcodec': split_codecs[0],
3437 'acodec': split_codecs[1],
3438 }
3439 return {}
3440
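# Example (editor's illustrative sketch, not part of the upstream file;
# codec strings here follow RFC 6381):
#   >>> parse_codecs('avc1.64001f, mp4a.40.2')
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}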
3441
3442 def urlhandle_detect_ext(url_handle):
3443 getheader = url_handle.headers.get
3444
3445 cd = getheader('Content-Disposition')
3446 if cd:
3447 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3448 if m:
3449 e = determine_ext(m.group('filename'), default_ext=None)
3450 if e:
3451 return e
3452
3453 return mimetype2ext(getheader('Content-Type'))
3454
3455
3456 def encode_data_uri(data, mime_type):
3457 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3458
3459
3460 def age_restricted(content_limit, age_limit):
3461 """ Returns True iff the content should be blocked """
3462
3463 if age_limit is None: # No limit set
3464 return False
3465 if content_limit is None:
3466 return False # Content available for everyone
3467 return age_limit < content_limit
3468
3469
3470 def is_html(first_bytes):
3471 """ Detect whether a file contains HTML by examining its first bytes. """
3472
3473 BOMS = [
3474 (b'\xef\xbb\xbf', 'utf-8'),
3475 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3476 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3477 (b'\xff\xfe', 'utf-16-le'),
3478 (b'\xfe\xff', 'utf-16-be'),
3479 ]
3480
3481 encoding = 'utf-8'
3482 for bom, enc in BOMS:
3483 while first_bytes.startswith(bom):
3484 encoding, first_bytes = enc, first_bytes[len(bom):]
3485
3486 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3487
3488
3489 def determine_protocol(info_dict):
3490 protocol = info_dict.get('protocol')
3491 if protocol is not None:
3492 return protocol
3493
3494 url = sanitize_url(info_dict['url'])
3495 if url.startswith('rtmp'):
3496 return 'rtmp'
3497 elif url.startswith('mms'):
3498 return 'mms'
3499 elif url.startswith('rtsp'):
3500 return 'rtsp'
3501
3502 ext = determine_ext(url)
3503 if ext == 'm3u8':
3504 return 'm3u8'
3505 elif ext == 'f4m':
3506 return 'f4m'
3507
3508 return compat_urllib_parse_urlparse(url).scheme
3509
3510
3511 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3512 """ Render a list of rows, each as a list of values.
3513 Text after a \t will be right aligned """
3514 def width(string):
3515 return len(remove_terminal_sequences(string).replace('\t', ''))
3516
3517 def get_max_lens(table):
3518 return [max(width(str(v)) for v in col) for col in zip(*table)]
3519
3520 def filter_using_list(row, filterArray):
3521 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3522
3523 max_lens = get_max_lens(data) if hide_empty else []
3524 header_row = filter_using_list(header_row, max_lens)
3525 data = [filter_using_list(row, max_lens) for row in data]
3526
3527 table = [header_row] + data
3528 max_lens = get_max_lens(table)
3529 extra_gap += 1
3530 if delim:
3531 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3532 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3533 for row in table:
3534 for pos, text in enumerate(map(str, row)):
3535 if '\t' in text:
3536 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3537 else:
3538 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3539 ret = '\n'.join(''.join(row).rstrip() for row in table)
3540 return ret
3541
3542
3543 def _match_one(filter_part, dct, incomplete):
3544 # TODO: Generalize code with YoutubeDL._build_format_filter
3545 STRING_OPERATORS = {
3546 '*=': operator.contains,
3547 '^=': lambda attr, value: attr.startswith(value),
3548 '$=': lambda attr, value: attr.endswith(value),
3549 '~=': lambda attr, value: re.search(value, attr),
3550 }
3551 COMPARISON_OPERATORS = {
3552 **STRING_OPERATORS,
3553 '<=': operator.le, # "<=" must be defined above "<"
3554 '<': operator.lt,
3555 '>=': operator.ge,
3556 '>': operator.gt,
3557 '=': operator.eq,
3558 }
3559
3560 if isinstance(incomplete, bool):
3561 is_incomplete = lambda _: incomplete
3562 else:
3563 is_incomplete = lambda k: k in incomplete
3564
3565 operator_rex = re.compile(r'''(?x)
3566 (?P<key>[a-z_]+)
3567 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3568 (?:
3569 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3570 (?P<strval>.+?)
3571 )
3572 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3573 m = operator_rex.fullmatch(filter_part.strip())
3574 if m:
3575 m = m.groupdict()
3576 unnegated_op = COMPARISON_OPERATORS[m['op']]
3577 if m['negation']:
3578 op = lambda attr, value: not unnegated_op(attr, value)
3579 else:
3580 op = unnegated_op
3581 comparison_value = m['quotedstrval'] or m['strval']
3582 if m['quote']:
3583 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3584 actual_value = dct.get(m['key'])
3585 numeric_comparison = None
3586 if isinstance(actual_value, (int, float)):
# If the original field is a string and the matching comparison value is
# a number, we should respect the origin of the original field
# and process the comparison value as a string (see
# https://github.com/ytdl-org/youtube-dl/issues/11082)
3591 try:
3592 numeric_comparison = int(comparison_value)
3593 except ValueError:
3594 numeric_comparison = parse_filesize(comparison_value)
3595 if numeric_comparison is None:
3596 numeric_comparison = parse_filesize(f'{comparison_value}B')
3597 if numeric_comparison is None:
3598 numeric_comparison = parse_duration(comparison_value)
3599 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3600 raise ValueError('Operator %s only supports string values!' % m['op'])
3601 if actual_value is None:
3602 return is_incomplete(m['key']) or m['none_inclusive']
3603 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3604
3605 UNARY_OPERATORS = {
3606 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3607 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3608 }
3609 operator_rex = re.compile(r'''(?x)
3610 (?P<op>%s)\s*(?P<key>[a-z_]+)
3611 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3612 m = operator_rex.fullmatch(filter_part.strip())
3613 if m:
3614 op = UNARY_OPERATORS[m.group('op')]
3615 actual_value = dct.get(m.group('key'))
3616 if is_incomplete(m.group('key')) and actual_value is None:
3617 return True
3618 return op(actual_value)
3619
3620 raise ValueError('Invalid filter part %r' % filter_part)
3621
3622
3623 def match_str(filter_str, dct, incomplete=False):
3624 """ Filter a dictionary with a simple string syntax.
3625 @returns Whether the filter passes
@param incomplete Set of keys that are expected to be missing from dct.
Can be True/False to indicate that all/none of the keys may be missing.
All conditions on incomplete keys pass if the key is missing.
3629 """
3630 return all(
3631 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3632 for filter_part in re.split(r'(?<!\\)&', filter_str))
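
# Examples (editor's note) of the filter syntax accepted by match_str/_match_one:
#
#   match_str('duration > 600 & title *= foo', {'duration': 700, 'title': 'foobar'})  # -> True
#   match_str('like_count >? 100', {'duration': 30})  # -> True ('?' lets a missing key pass)
#   match_str('!is_live', {'is_live': False})         # -> True (unary operator)
#
# '&' separates conditions that must all hold; a literal '&' can be escaped as '\&'.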
3633
3634
3635 def match_filter_func(filters):
3636 if not filters:
3637 return None
3638 filters = set(variadic(filters))
3639
3640 interactive = '-' in filters
3641 if interactive:
3642 filters.remove('-')
3643
3644 def _match_func(info_dict, incomplete=False):
3645 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3646 return NO_DEFAULT if interactive and not incomplete else None
3647 else:
3648 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3649 filter_str = ') | ('.join(map(str.strip, filters))
return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3651 return _match_func
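
# Editor's note: the returned function is what --match-filter uses internally:
#   f = match_filter_func('duration > 60')
#   f({'id': 'x', 'duration': 120})  # -> None (no objection: download)
#   f({'id': 'x', 'duration': 30})   # -> a 'does not pass filter' message (skip)
# A '-' filter makes it return NO_DEFAULT for complete videos, i.e. ask interactively.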
3652
3653
3654 def download_range_func(chapters, ranges):
3655 def inner(info_dict, ydl):
3656 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3657 else 'Cannot match chapters since chapter information is unavailable')
3658 for regex in chapters or []:
3659 for i, chapter in enumerate(info_dict.get('chapters') or []):
3660 if re.search(regex, chapter['title']):
3661 warning = None
3662 yield {**chapter, 'index': i}
3663 if chapters and warning:
3664 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3665
3666 yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
3667
3668 return inner
3669
3670
3671 def parse_dfxp_time_expr(time_expr):
3672 if not time_expr:
3673 return
3674
3675 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3676 if mobj:
3677 return float(mobj.group('time_offset'))
3678
3679 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3680 if mobj:
3681 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
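
# Examples (editor's note; NUMBER_RE is defined earlier in this module):
#   parse_dfxp_time_expr('5.1s')         # -> 5.1
#   parse_dfxp_time_expr('00:01:02.5')   # -> 62.5
#   parse_dfxp_time_expr('00:01:02:12')  # -> 62.12 (frames become a decimal fraction)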
3682
3683
3684 def srt_subtitles_timecode(seconds):
3685 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3686
3687
3688 def ass_subtitles_timecode(seconds):
3689 time = timetuple_from_msec(seconds * 1000)
3690 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
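
# Examples (editor's note; timetuple_from_msec, defined earlier in this module,
# splits milliseconds into (hours, minutes, seconds, milliseconds)):
#   srt_subtitles_timecode(3.5)  # -> '00:00:03,500' (SRT: comma, milliseconds)
#   ass_subtitles_timecode(3.5)  # -> '0:00:03.50'   (ASS: dot, centiseconds)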
3691
3692
3693 def dfxp2srt(dfxp_data):
3694 '''
3695 @param dfxp_data A bytes-like object containing DFXP data
3696 @returns A unicode object containing converted SRT data
3697 '''
3698 LEGACY_NAMESPACES = (
3699 (b'http://www.w3.org/ns/ttml', [
3700 b'http://www.w3.org/2004/11/ttaf1',
3701 b'http://www.w3.org/2006/04/ttaf1',
3702 b'http://www.w3.org/2006/10/ttaf1',
3703 ]),
3704 (b'http://www.w3.org/ns/ttml#styling', [
3705 b'http://www.w3.org/ns/ttml#style',
3706 ]),
3707 )
3708
3709 SUPPORTED_STYLING = [
3710 'color',
3711 'fontFamily',
3712 'fontSize',
3713 'fontStyle',
3714 'fontWeight',
3715 'textDecoration'
3716 ]
3717
3718 _x = functools.partial(xpath_with_ns, ns_map={
3719 'xml': 'http://www.w3.org/XML/1998/namespace',
3720 'ttml': 'http://www.w3.org/ns/ttml',
3721 'tts': 'http://www.w3.org/ns/ttml#styling',
3722 })
3723
3724 styles = {}
3725 default_style = {}
3726
class TTMLPElementParser:
def __init__(self):
# use instance attributes; mutable class attributes would be shared between instances
self._out = ''
self._unclosed_elements = []
self._applied_styles = []
3731
3732 def start(self, tag, attrib):
3733 if tag in (_x('ttml:br'), 'br'):
3734 self._out += '\n'
3735 else:
3736 unclosed_elements = []
3737 style = {}
3738 element_style_id = attrib.get('style')
3739 if default_style:
3740 style.update(default_style)
3741 if element_style_id:
3742 style.update(styles.get(element_style_id, {}))
3743 for prop in SUPPORTED_STYLING:
3744 prop_val = attrib.get(_x('tts:' + prop))
3745 if prop_val:
3746 style[prop] = prop_val
3747 if style:
3748 font = ''
3749 for k, v in sorted(style.items()):
3750 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3751 continue
3752 if k == 'color':
3753 font += ' color="%s"' % v
3754 elif k == 'fontSize':
3755 font += ' size="%s"' % v
3756 elif k == 'fontFamily':
3757 font += ' face="%s"' % v
3758 elif k == 'fontWeight' and v == 'bold':
3759 self._out += '<b>'
3760 unclosed_elements.append('b')
3761 elif k == 'fontStyle' and v == 'italic':
3762 self._out += '<i>'
3763 unclosed_elements.append('i')
3764 elif k == 'textDecoration' and v == 'underline':
3765 self._out += '<u>'
3766 unclosed_elements.append('u')
3767 if font:
3768 self._out += '<font' + font + '>'
3769 unclosed_elements.append('font')
3770 applied_style = {}
3771 if self._applied_styles:
3772 applied_style.update(self._applied_styles[-1])
3773 applied_style.update(style)
3774 self._applied_styles.append(applied_style)
3775 self._unclosed_elements.append(unclosed_elements)
3776
3777 def end(self, tag):
3778 if tag not in (_x('ttml:br'), 'br'):
3779 unclosed_elements = self._unclosed_elements.pop()
3780 for element in reversed(unclosed_elements):
3781 self._out += '</%s>' % element
3782 if unclosed_elements and self._applied_styles:
3783 self._applied_styles.pop()
3784
3785 def data(self, data):
3786 self._out += data
3787
3788 def close(self):
3789 return self._out.strip()
3790
3791 def parse_node(node):
3792 target = TTMLPElementParser()
3793 parser = xml.etree.ElementTree.XMLParser(target=target)
3794 parser.feed(xml.etree.ElementTree.tostring(node))
3795 return parser.close()
3796
3797 for k, v in LEGACY_NAMESPACES:
3798 for ns in v:
3799 dfxp_data = dfxp_data.replace(ns, k)
3800
3801 dfxp = compat_etree_fromstring(dfxp_data)
3802 out = []
3803 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3804
3805 if not paras:
3806 raise ValueError('Invalid dfxp/TTML subtitle')
3807
3808 repeat = False
3809 while True:
3810 for style in dfxp.findall(_x('.//ttml:style')):
3811 style_id = style.get('id') or style.get(_x('xml:id'))
3812 if not style_id:
3813 continue
3814 parent_style_id = style.get('style')
3815 if parent_style_id:
3816 if parent_style_id not in styles:
3817 repeat = True
3818 continue
3819 styles[style_id] = styles[parent_style_id].copy()
3820 for prop in SUPPORTED_STYLING:
3821 prop_val = style.get(_x('tts:' + prop))
3822 if prop_val:
3823 styles.setdefault(style_id, {})[prop] = prop_val
3824 if repeat:
3825 repeat = False
3826 else:
3827 break
3828
3829 for p in ('body', 'div'):
3830 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3831 if ele is None:
3832 continue
3833 style = styles.get(ele.get('style'))
3834 if not style:
3835 continue
3836 default_style.update(style)
3837
3838 for para, index in zip(paras, itertools.count(1)):
3839 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3840 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3841 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3842 if begin_time is None:
3843 continue
3844 if not end_time:
3845 if not dur:
3846 continue
3847 end_time = begin_time + dur
3848 out.append('%d\n%s --> %s\n%s\n\n' % (
3849 index,
3850 srt_subtitles_timecode(begin_time),
3851 srt_subtitles_timecode(end_time),
3852 parse_node(para)))
3853
3854 return ''.join(out)
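
# Minimal round-trip example (editor's illustrative sketch):
#
#   data = (b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#           b'<p begin="0s" end="1.5s">Hello</p></div></body></tt>')
#   dfxp2srt(data)
#
# returns:
#
#   1
#   00:00:00,000 --> 00:00:01,500
#   Hello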
3855
3856
3857 def cli_option(params, command_option, param, separator=None):
3858 param = params.get(param)
3859 return ([] if param is None
3860 else [command_option, str(param)] if separator is None
3861 else [f'{command_option}{separator}{param}'])
3862
3863
3864 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3865 param = params.get(param)
3866 assert param in (True, False, None)
3867 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3868
3869
3870 def cli_valueless_option(params, command_option, param, expected_value=True):
3871 return [command_option] if params.get(param) == expected_value else []
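
# Examples (editor's note) of how the cli_* helpers translate option dicts into argv:
#   cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
#       # -> ['--proxy', '127.0.0.1:3128']
#   cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#       # -> ['--no-check-certificate', 'true'] (it reuses cli_option on a {True: ..., False: ...} dict)
#   cli_bool_option({'nocheckcertificate': True}, '--check-certificate',
#                   'nocheckcertificate', 'false', 'true', '=')
#       # -> ['--check-certificate=false']
#   cli_valueless_option({'nopart': True}, '--no-part', 'nopart')  # -> ['--no-part']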
3872
3873
3874 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3875 if isinstance(argdict, (list, tuple)): # for backward compatibility
3876 if use_compat:
3877 return argdict
3878 else:
3879 argdict = None
3880 if argdict is None:
3881 return default
3882 assert isinstance(argdict, dict)
3883
3884 assert isinstance(keys, (list, tuple))
3885 for key_list in keys:
3886 arg_list = list(filter(
3887 lambda x: x is not None,
3888 [argdict.get(key.lower()) for key in variadic(key_list)]))
3889 if arg_list:
3890 return [arg for args in arg_list for arg in args]
3891 return default
3892
3893
3894 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3895 main_key, exe = main_key.lower(), exe.lower()
3896 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3897 keys = [f'{root_key}{k}' for k in (keys or [''])]
3898 if root_key in keys:
3899 if main_key != exe:
3900 keys.append((main_key, exe))
3901 keys.append('default')
3902 else:
3903 use_compat = False
3904 return cli_configuration_args(argdict, keys, default, use_compat)
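
# Editor's sketch of the lookup cascade, e.g. for downloader args passed to ffmpeg:
#   argdict = {'ffmpeg': ['-loglevel', 'error'], 'default': ['-hide_banner']}
#   _configuration_args('downloader', argdict, 'ffmpeg')
# tries the keys 'downloader+ffmpeg', then ('downloader', 'ffmpeg'), then 'default',
# and returns ['-loglevel', 'error'] here, the first non-empty match.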
3905
3906
3907 class ISO639Utils:
3908 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3909 _lang_map = {
3910 'aa': 'aar',
3911 'ab': 'abk',
3912 'ae': 'ave',
3913 'af': 'afr',
3914 'ak': 'aka',
3915 'am': 'amh',
3916 'an': 'arg',
3917 'ar': 'ara',
3918 'as': 'asm',
3919 'av': 'ava',
3920 'ay': 'aym',
3921 'az': 'aze',
3922 'ba': 'bak',
3923 'be': 'bel',
3924 'bg': 'bul',
3925 'bh': 'bih',
3926 'bi': 'bis',
3927 'bm': 'bam',
3928 'bn': 'ben',
3929 'bo': 'bod',
3930 'br': 'bre',
3931 'bs': 'bos',
3932 'ca': 'cat',
3933 'ce': 'che',
3934 'ch': 'cha',
3935 'co': 'cos',
3936 'cr': 'cre',
3937 'cs': 'ces',
3938 'cu': 'chu',
3939 'cv': 'chv',
3940 'cy': 'cym',
3941 'da': 'dan',
3942 'de': 'deu',
3943 'dv': 'div',
3944 'dz': 'dzo',
3945 'ee': 'ewe',
3946 'el': 'ell',
3947 'en': 'eng',
3948 'eo': 'epo',
3949 'es': 'spa',
3950 'et': 'est',
3951 'eu': 'eus',
3952 'fa': 'fas',
3953 'ff': 'ful',
3954 'fi': 'fin',
3955 'fj': 'fij',
3956 'fo': 'fao',
3957 'fr': 'fra',
3958 'fy': 'fry',
3959 'ga': 'gle',
3960 'gd': 'gla',
3961 'gl': 'glg',
3962 'gn': 'grn',
3963 'gu': 'guj',
3964 'gv': 'glv',
3965 'ha': 'hau',
3966 'he': 'heb',
3967 'iw': 'heb', # Replaced by he in 1989 revision
3968 'hi': 'hin',
3969 'ho': 'hmo',
3970 'hr': 'hrv',
3971 'ht': 'hat',
3972 'hu': 'hun',
3973 'hy': 'hye',
3974 'hz': 'her',
3975 'ia': 'ina',
3976 'id': 'ind',
3977 'in': 'ind', # Replaced by id in 1989 revision
3978 'ie': 'ile',
3979 'ig': 'ibo',
3980 'ii': 'iii',
3981 'ik': 'ipk',
3982 'io': 'ido',
3983 'is': 'isl',
3984 'it': 'ita',
3985 'iu': 'iku',
3986 'ja': 'jpn',
3987 'jv': 'jav',
3988 'ka': 'kat',
3989 'kg': 'kon',
3990 'ki': 'kik',
3991 'kj': 'kua',
3992 'kk': 'kaz',
3993 'kl': 'kal',
3994 'km': 'khm',
3995 'kn': 'kan',
3996 'ko': 'kor',
3997 'kr': 'kau',
3998 'ks': 'kas',
3999 'ku': 'kur',
4000 'kv': 'kom',
4001 'kw': 'cor',
4002 'ky': 'kir',
4003 'la': 'lat',
4004 'lb': 'ltz',
4005 'lg': 'lug',
4006 'li': 'lim',
4007 'ln': 'lin',
4008 'lo': 'lao',
4009 'lt': 'lit',
4010 'lu': 'lub',
4011 'lv': 'lav',
4012 'mg': 'mlg',
4013 'mh': 'mah',
4014 'mi': 'mri',
4015 'mk': 'mkd',
4016 'ml': 'mal',
4017 'mn': 'mon',
4018 'mr': 'mar',
4019 'ms': 'msa',
4020 'mt': 'mlt',
4021 'my': 'mya',
4022 'na': 'nau',
4023 'nb': 'nob',
4024 'nd': 'nde',
4025 'ne': 'nep',
4026 'ng': 'ndo',
4027 'nl': 'nld',
4028 'nn': 'nno',
4029 'no': 'nor',
4030 'nr': 'nbl',
4031 'nv': 'nav',
4032 'ny': 'nya',
4033 'oc': 'oci',
4034 'oj': 'oji',
4035 'om': 'orm',
4036 'or': 'ori',
4037 'os': 'oss',
4038 'pa': 'pan',
4039 'pi': 'pli',
4040 'pl': 'pol',
4041 'ps': 'pus',
4042 'pt': 'por',
4043 'qu': 'que',
4044 'rm': 'roh',
4045 'rn': 'run',
4046 'ro': 'ron',
4047 'ru': 'rus',
4048 'rw': 'kin',
4049 'sa': 'san',
4050 'sc': 'srd',
4051 'sd': 'snd',
4052 'se': 'sme',
4053 'sg': 'sag',
4054 'si': 'sin',
4055 'sk': 'slk',
4056 'sl': 'slv',
4057 'sm': 'smo',
4058 'sn': 'sna',
4059 'so': 'som',
4060 'sq': 'sqi',
4061 'sr': 'srp',
4062 'ss': 'ssw',
4063 'st': 'sot',
4064 'su': 'sun',
4065 'sv': 'swe',
4066 'sw': 'swa',
4067 'ta': 'tam',
4068 'te': 'tel',
4069 'tg': 'tgk',
4070 'th': 'tha',
4071 'ti': 'tir',
4072 'tk': 'tuk',
4073 'tl': 'tgl',
4074 'tn': 'tsn',
4075 'to': 'ton',
4076 'tr': 'tur',
4077 'ts': 'tso',
4078 'tt': 'tat',
4079 'tw': 'twi',
4080 'ty': 'tah',
4081 'ug': 'uig',
4082 'uk': 'ukr',
4083 'ur': 'urd',
4084 'uz': 'uzb',
4085 've': 'ven',
4086 'vi': 'vie',
4087 'vo': 'vol',
4088 'wa': 'wln',
4089 'wo': 'wol',
4090 'xh': 'xho',
4091 'yi': 'yid',
4092 'ji': 'yid', # Replaced by yi in 1989 revision
4093 'yo': 'yor',
4094 'za': 'zha',
4095 'zh': 'zho',
4096 'zu': 'zul',
4097 }
4098
4099 @classmethod
4100 def short2long(cls, code):
4101 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4102 return cls._lang_map.get(code[:2])
4103
4104 @classmethod
4105 def long2short(cls, code):
4106 """Convert language code from ISO 639-2/T to ISO 639-1"""
4107 for short_name, long_name in cls._lang_map.items():
4108 if long_name == code:
4109 return short_name
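
# Examples (editor's note):
#   ISO639Utils.short2long('en')     # -> 'eng'
#   ISO639Utils.short2long('en-US')  # -> 'eng' (only the first two characters are used)
#   ISO639Utils.long2short('deu')    # -> 'de'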
4110
4111
4112 class ISO3166Utils:
4113 # From http://data.okfn.org/data/core/country-list
4114 _country_map = {
4115 'AF': 'Afghanistan',
4116 'AX': 'Åland Islands',
4117 'AL': 'Albania',
4118 'DZ': 'Algeria',
4119 'AS': 'American Samoa',
4120 'AD': 'Andorra',
4121 'AO': 'Angola',
4122 'AI': 'Anguilla',
4123 'AQ': 'Antarctica',
4124 'AG': 'Antigua and Barbuda',
4125 'AR': 'Argentina',
4126 'AM': 'Armenia',
4127 'AW': 'Aruba',
4128 'AU': 'Australia',
4129 'AT': 'Austria',
4130 'AZ': 'Azerbaijan',
4131 'BS': 'Bahamas',
4132 'BH': 'Bahrain',
4133 'BD': 'Bangladesh',
4134 'BB': 'Barbados',
4135 'BY': 'Belarus',
4136 'BE': 'Belgium',
4137 'BZ': 'Belize',
4138 'BJ': 'Benin',
4139 'BM': 'Bermuda',
4140 'BT': 'Bhutan',
4141 'BO': 'Bolivia, Plurinational State of',
4142 'BQ': 'Bonaire, Sint Eustatius and Saba',
4143 'BA': 'Bosnia and Herzegovina',
4144 'BW': 'Botswana',
4145 'BV': 'Bouvet Island',
4146 'BR': 'Brazil',
4147 'IO': 'British Indian Ocean Territory',
4148 'BN': 'Brunei Darussalam',
4149 'BG': 'Bulgaria',
4150 'BF': 'Burkina Faso',
4151 'BI': 'Burundi',
4152 'KH': 'Cambodia',
4153 'CM': 'Cameroon',
4154 'CA': 'Canada',
4155 'CV': 'Cape Verde',
4156 'KY': 'Cayman Islands',
4157 'CF': 'Central African Republic',
4158 'TD': 'Chad',
4159 'CL': 'Chile',
4160 'CN': 'China',
4161 'CX': 'Christmas Island',
4162 'CC': 'Cocos (Keeling) Islands',
4163 'CO': 'Colombia',
4164 'KM': 'Comoros',
4165 'CG': 'Congo',
4166 'CD': 'Congo, the Democratic Republic of the',
4167 'CK': 'Cook Islands',
4168 'CR': 'Costa Rica',
4169 'CI': 'Côte d\'Ivoire',
4170 'HR': 'Croatia',
4171 'CU': 'Cuba',
4172 'CW': 'Curaçao',
4173 'CY': 'Cyprus',
4174 'CZ': 'Czech Republic',
4175 'DK': 'Denmark',
4176 'DJ': 'Djibouti',
4177 'DM': 'Dominica',
4178 'DO': 'Dominican Republic',
4179 'EC': 'Ecuador',
4180 'EG': 'Egypt',
4181 'SV': 'El Salvador',
4182 'GQ': 'Equatorial Guinea',
4183 'ER': 'Eritrea',
4184 'EE': 'Estonia',
4185 'ET': 'Ethiopia',
4186 'FK': 'Falkland Islands (Malvinas)',
4187 'FO': 'Faroe Islands',
4188 'FJ': 'Fiji',
4189 'FI': 'Finland',
4190 'FR': 'France',
4191 'GF': 'French Guiana',
4192 'PF': 'French Polynesia',
4193 'TF': 'French Southern Territories',
4194 'GA': 'Gabon',
4195 'GM': 'Gambia',
4196 'GE': 'Georgia',
4197 'DE': 'Germany',
4198 'GH': 'Ghana',
4199 'GI': 'Gibraltar',
4200 'GR': 'Greece',
4201 'GL': 'Greenland',
4202 'GD': 'Grenada',
4203 'GP': 'Guadeloupe',
4204 'GU': 'Guam',
4205 'GT': 'Guatemala',
4206 'GG': 'Guernsey',
4207 'GN': 'Guinea',
4208 'GW': 'Guinea-Bissau',
4209 'GY': 'Guyana',
4210 'HT': 'Haiti',
4211 'HM': 'Heard Island and McDonald Islands',
4212 'VA': 'Holy See (Vatican City State)',
4213 'HN': 'Honduras',
4214 'HK': 'Hong Kong',
4215 'HU': 'Hungary',
4216 'IS': 'Iceland',
4217 'IN': 'India',
4218 'ID': 'Indonesia',
4219 'IR': 'Iran, Islamic Republic of',
4220 'IQ': 'Iraq',
4221 'IE': 'Ireland',
4222 'IM': 'Isle of Man',
4223 'IL': 'Israel',
4224 'IT': 'Italy',
4225 'JM': 'Jamaica',
4226 'JP': 'Japan',
4227 'JE': 'Jersey',
4228 'JO': 'Jordan',
4229 'KZ': 'Kazakhstan',
4230 'KE': 'Kenya',
4231 'KI': 'Kiribati',
4232 'KP': 'Korea, Democratic People\'s Republic of',
4233 'KR': 'Korea, Republic of',
4234 'KW': 'Kuwait',
4235 'KG': 'Kyrgyzstan',
4236 'LA': 'Lao People\'s Democratic Republic',
4237 'LV': 'Latvia',
4238 'LB': 'Lebanon',
4239 'LS': 'Lesotho',
4240 'LR': 'Liberia',
4241 'LY': 'Libya',
4242 'LI': 'Liechtenstein',
4243 'LT': 'Lithuania',
4244 'LU': 'Luxembourg',
4245 'MO': 'Macao',
4246 'MK': 'Macedonia, the Former Yugoslav Republic of',
4247 'MG': 'Madagascar',
4248 'MW': 'Malawi',
4249 'MY': 'Malaysia',
4250 'MV': 'Maldives',
4251 'ML': 'Mali',
4252 'MT': 'Malta',
4253 'MH': 'Marshall Islands',
4254 'MQ': 'Martinique',
4255 'MR': 'Mauritania',
4256 'MU': 'Mauritius',
4257 'YT': 'Mayotte',
4258 'MX': 'Mexico',
4259 'FM': 'Micronesia, Federated States of',
4260 'MD': 'Moldova, Republic of',
4261 'MC': 'Monaco',
4262 'MN': 'Mongolia',
4263 'ME': 'Montenegro',
4264 'MS': 'Montserrat',
4265 'MA': 'Morocco',
4266 'MZ': 'Mozambique',
4267 'MM': 'Myanmar',
4268 'NA': 'Namibia',
4269 'NR': 'Nauru',
4270 'NP': 'Nepal',
4271 'NL': 'Netherlands',
4272 'NC': 'New Caledonia',
4273 'NZ': 'New Zealand',
4274 'NI': 'Nicaragua',
4275 'NE': 'Niger',
4276 'NG': 'Nigeria',
4277 'NU': 'Niue',
4278 'NF': 'Norfolk Island',
4279 'MP': 'Northern Mariana Islands',
4280 'NO': 'Norway',
4281 'OM': 'Oman',
4282 'PK': 'Pakistan',
4283 'PW': 'Palau',
4284 'PS': 'Palestine, State of',
4285 'PA': 'Panama',
4286 'PG': 'Papua New Guinea',
4287 'PY': 'Paraguay',
4288 'PE': 'Peru',
4289 'PH': 'Philippines',
4290 'PN': 'Pitcairn',
4291 'PL': 'Poland',
4292 'PT': 'Portugal',
4293 'PR': 'Puerto Rico',
4294 'QA': 'Qatar',
4295 'RE': 'Réunion',
4296 'RO': 'Romania',
4297 'RU': 'Russian Federation',
4298 'RW': 'Rwanda',
4299 'BL': 'Saint Barthélemy',
4300 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4301 'KN': 'Saint Kitts and Nevis',
4302 'LC': 'Saint Lucia',
4303 'MF': 'Saint Martin (French part)',
4304 'PM': 'Saint Pierre and Miquelon',
4305 'VC': 'Saint Vincent and the Grenadines',
4306 'WS': 'Samoa',
4307 'SM': 'San Marino',
4308 'ST': 'Sao Tome and Principe',
4309 'SA': 'Saudi Arabia',
4310 'SN': 'Senegal',
4311 'RS': 'Serbia',
4312 'SC': 'Seychelles',
4313 'SL': 'Sierra Leone',
4314 'SG': 'Singapore',
4315 'SX': 'Sint Maarten (Dutch part)',
4316 'SK': 'Slovakia',
4317 'SI': 'Slovenia',
4318 'SB': 'Solomon Islands',
4319 'SO': 'Somalia',
4320 'ZA': 'South Africa',
4321 'GS': 'South Georgia and the South Sandwich Islands',
4322 'SS': 'South Sudan',
4323 'ES': 'Spain',
4324 'LK': 'Sri Lanka',
4325 'SD': 'Sudan',
4326 'SR': 'Suriname',
4327 'SJ': 'Svalbard and Jan Mayen',
4328 'SZ': 'Swaziland',
4329 'SE': 'Sweden',
4330 'CH': 'Switzerland',
4331 'SY': 'Syrian Arab Republic',
4332 'TW': 'Taiwan, Province of China',
4333 'TJ': 'Tajikistan',
4334 'TZ': 'Tanzania, United Republic of',
4335 'TH': 'Thailand',
4336 'TL': 'Timor-Leste',
4337 'TG': 'Togo',
4338 'TK': 'Tokelau',
4339 'TO': 'Tonga',
4340 'TT': 'Trinidad and Tobago',
4341 'TN': 'Tunisia',
4342 'TR': 'Turkey',
4343 'TM': 'Turkmenistan',
4344 'TC': 'Turks and Caicos Islands',
4345 'TV': 'Tuvalu',
4346 'UG': 'Uganda',
4347 'UA': 'Ukraine',
4348 'AE': 'United Arab Emirates',
4349 'GB': 'United Kingdom',
4350 'US': 'United States',
4351 'UM': 'United States Minor Outlying Islands',
4352 'UY': 'Uruguay',
4353 'UZ': 'Uzbekistan',
4354 'VU': 'Vanuatu',
4355 'VE': 'Venezuela, Bolivarian Republic of',
4356 'VN': 'Viet Nam',
4357 'VG': 'Virgin Islands, British',
4358 'VI': 'Virgin Islands, U.S.',
4359 'WF': 'Wallis and Futuna',
4360 'EH': 'Western Sahara',
4361 'YE': 'Yemen',
4362 'ZM': 'Zambia',
4363 'ZW': 'Zimbabwe',
4364 # Not ISO 3166 codes, but used for IP blocks
4365 'AP': 'Asia/Pacific Region',
4366 'EU': 'Europe',
4367 }
4368
4369 @classmethod
4370 def short2full(cls, code):
4371 """Convert an ISO 3166-2 country code to the corresponding full name"""
4372 return cls._country_map.get(code.upper())
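
# Examples (editor's note):
#   ISO3166Utils.short2full('de')  # -> 'Germany' (lookup is case-insensitive)
#   ISO3166Utils.short2full('XX')  # -> None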
4373
4374
4375 class GeoUtils:
4376 # Major IPv4 address blocks per country
4377 _country_ip_map = {
4378 'AD': '46.172.224.0/19',
4379 'AE': '94.200.0.0/13',
4380 'AF': '149.54.0.0/17',
4381 'AG': '209.59.64.0/18',
4382 'AI': '204.14.248.0/21',
4383 'AL': '46.99.0.0/16',
4384 'AM': '46.70.0.0/15',
4385 'AO': '105.168.0.0/13',
4386 'AP': '182.50.184.0/21',
4387 'AQ': '23.154.160.0/24',
4388 'AR': '181.0.0.0/12',
4389 'AS': '202.70.112.0/20',
4390 'AT': '77.116.0.0/14',
4391 'AU': '1.128.0.0/11',
4392 'AW': '181.41.0.0/18',
4393 'AX': '185.217.4.0/22',
4394 'AZ': '5.197.0.0/16',
4395 'BA': '31.176.128.0/17',
4396 'BB': '65.48.128.0/17',
4397 'BD': '114.130.0.0/16',
4398 'BE': '57.0.0.0/8',
4399 'BF': '102.178.0.0/15',
4400 'BG': '95.42.0.0/15',
4401 'BH': '37.131.0.0/17',
4402 'BI': '154.117.192.0/18',
4403 'BJ': '137.255.0.0/16',
4404 'BL': '185.212.72.0/23',
4405 'BM': '196.12.64.0/18',
4406 'BN': '156.31.0.0/16',
4407 'BO': '161.56.0.0/16',
4408 'BQ': '161.0.80.0/20',
4409 'BR': '191.128.0.0/12',
4410 'BS': '24.51.64.0/18',
4411 'BT': '119.2.96.0/19',
4412 'BW': '168.167.0.0/16',
4413 'BY': '178.120.0.0/13',
4414 'BZ': '179.42.192.0/18',
4415 'CA': '99.224.0.0/11',
4416 'CD': '41.243.0.0/16',
4417 'CF': '197.242.176.0/21',
4418 'CG': '160.113.0.0/16',
4419 'CH': '85.0.0.0/13',
4420 'CI': '102.136.0.0/14',
4421 'CK': '202.65.32.0/19',
4422 'CL': '152.172.0.0/14',
4423 'CM': '102.244.0.0/14',
4424 'CN': '36.128.0.0/10',
4425 'CO': '181.240.0.0/12',
4426 'CR': '201.192.0.0/12',
4427 'CU': '152.206.0.0/15',
4428 'CV': '165.90.96.0/19',
4429 'CW': '190.88.128.0/17',
4430 'CY': '31.153.0.0/16',
4431 'CZ': '88.100.0.0/14',
4432 'DE': '53.0.0.0/8',
4433 'DJ': '197.241.0.0/17',
4434 'DK': '87.48.0.0/12',
4435 'DM': '192.243.48.0/20',
4436 'DO': '152.166.0.0/15',
4437 'DZ': '41.96.0.0/12',
4438 'EC': '186.68.0.0/15',
4439 'EE': '90.190.0.0/15',
4440 'EG': '156.160.0.0/11',
4441 'ER': '196.200.96.0/20',
4442 'ES': '88.0.0.0/11',
4443 'ET': '196.188.0.0/14',
4444 'EU': '2.16.0.0/13',
4445 'FI': '91.152.0.0/13',
4446 'FJ': '144.120.0.0/16',
4447 'FK': '80.73.208.0/21',
4448 'FM': '119.252.112.0/20',
4449 'FO': '88.85.32.0/19',
4450 'FR': '90.0.0.0/9',
4451 'GA': '41.158.0.0/15',
4452 'GB': '25.0.0.0/8',
4453 'GD': '74.122.88.0/21',
4454 'GE': '31.146.0.0/16',
4455 'GF': '161.22.64.0/18',
4456 'GG': '62.68.160.0/19',
4457 'GH': '154.160.0.0/12',
4458 'GI': '95.164.0.0/16',
4459 'GL': '88.83.0.0/19',
4460 'GM': '160.182.0.0/15',
4461 'GN': '197.149.192.0/18',
4462 'GP': '104.250.0.0/19',
4463 'GQ': '105.235.224.0/20',
4464 'GR': '94.64.0.0/13',
4465 'GT': '168.234.0.0/16',
4466 'GU': '168.123.0.0/16',
4467 'GW': '197.214.80.0/20',
4468 'GY': '181.41.64.0/18',
4469 'HK': '113.252.0.0/14',
4470 'HN': '181.210.0.0/16',
4471 'HR': '93.136.0.0/13',
4472 'HT': '148.102.128.0/17',
4473 'HU': '84.0.0.0/14',
4474 'ID': '39.192.0.0/10',
4475 'IE': '87.32.0.0/12',
4476 'IL': '79.176.0.0/13',
4477 'IM': '5.62.80.0/20',
4478 'IN': '117.192.0.0/10',
4479 'IO': '203.83.48.0/21',
4480 'IQ': '37.236.0.0/14',
4481 'IR': '2.176.0.0/12',
4482 'IS': '82.221.0.0/16',
4483 'IT': '79.0.0.0/10',
4484 'JE': '87.244.64.0/18',
4485 'JM': '72.27.0.0/17',
4486 'JO': '176.29.0.0/16',
4487 'JP': '133.0.0.0/8',
4488 'KE': '105.48.0.0/12',
4489 'KG': '158.181.128.0/17',
4490 'KH': '36.37.128.0/17',
4491 'KI': '103.25.140.0/22',
4492 'KM': '197.255.224.0/20',
4493 'KN': '198.167.192.0/19',
4494 'KP': '175.45.176.0/22',
4495 'KR': '175.192.0.0/10',
4496 'KW': '37.36.0.0/14',
4497 'KY': '64.96.0.0/15',
4498 'KZ': '2.72.0.0/13',
4499 'LA': '115.84.64.0/18',
4500 'LB': '178.135.0.0/16',
4501 'LC': '24.92.144.0/20',
4502 'LI': '82.117.0.0/19',
4503 'LK': '112.134.0.0/15',
4504 'LR': '102.183.0.0/16',
4505 'LS': '129.232.0.0/17',
4506 'LT': '78.56.0.0/13',
4507 'LU': '188.42.0.0/16',
4508 'LV': '46.109.0.0/16',
4509 'LY': '41.252.0.0/14',
4510 'MA': '105.128.0.0/11',
4511 'MC': '88.209.64.0/18',
4512 'MD': '37.246.0.0/16',
4513 'ME': '178.175.0.0/17',
4514 'MF': '74.112.232.0/21',
4515 'MG': '154.126.0.0/17',
4516 'MH': '117.103.88.0/21',
4517 'MK': '77.28.0.0/15',
4518 'ML': '154.118.128.0/18',
4519 'MM': '37.111.0.0/17',
4520 'MN': '49.0.128.0/17',
4521 'MO': '60.246.0.0/16',
4522 'MP': '202.88.64.0/20',
4523 'MQ': '109.203.224.0/19',
4524 'MR': '41.188.64.0/18',
4525 'MS': '208.90.112.0/22',
4526 'MT': '46.11.0.0/16',
4527 'MU': '105.16.0.0/12',
4528 'MV': '27.114.128.0/18',
4529 'MW': '102.70.0.0/15',
4530 'MX': '187.192.0.0/11',
4531 'MY': '175.136.0.0/13',
4532 'MZ': '197.218.0.0/15',
4533 'NA': '41.182.0.0/16',
4534 'NC': '101.101.0.0/18',
4535 'NE': '197.214.0.0/18',
4536 'NF': '203.17.240.0/22',
4537 'NG': '105.112.0.0/12',
4538 'NI': '186.76.0.0/15',
4539 'NL': '145.96.0.0/11',
4540 'NO': '84.208.0.0/13',
4541 'NP': '36.252.0.0/15',
4542 'NR': '203.98.224.0/19',
4543 'NU': '49.156.48.0/22',
4544 'NZ': '49.224.0.0/14',
4545 'OM': '5.36.0.0/15',
4546 'PA': '186.72.0.0/15',
4547 'PE': '186.160.0.0/14',
4548 'PF': '123.50.64.0/18',
4549 'PG': '124.240.192.0/19',
4550 'PH': '49.144.0.0/13',
4551 'PK': '39.32.0.0/11',
4552 'PL': '83.0.0.0/11',
4553 'PM': '70.36.0.0/20',
4554 'PR': '66.50.0.0/16',
4555 'PS': '188.161.0.0/16',
4556 'PT': '85.240.0.0/13',
4557 'PW': '202.124.224.0/20',
4558 'PY': '181.120.0.0/14',
4559 'QA': '37.210.0.0/15',
4560 'RE': '102.35.0.0/16',
4561 'RO': '79.112.0.0/13',
4562 'RS': '93.86.0.0/15',
4563 'RU': '5.136.0.0/13',
4564 'RW': '41.186.0.0/16',
4565 'SA': '188.48.0.0/13',
4566 'SB': '202.1.160.0/19',
4567 'SC': '154.192.0.0/11',
4568 'SD': '102.120.0.0/13',
4569 'SE': '78.64.0.0/12',
4570 'SG': '8.128.0.0/10',
4571 'SI': '188.196.0.0/14',
4572 'SK': '78.98.0.0/15',
4573 'SL': '102.143.0.0/17',
4574 'SM': '89.186.32.0/19',
4575 'SN': '41.82.0.0/15',
4576 'SO': '154.115.192.0/18',
4577 'SR': '186.179.128.0/17',
4578 'SS': '105.235.208.0/21',
4579 'ST': '197.159.160.0/19',
4580 'SV': '168.243.0.0/16',
4581 'SX': '190.102.0.0/20',
4582 'SY': '5.0.0.0/16',
4583 'SZ': '41.84.224.0/19',
4584 'TC': '65.255.48.0/20',
4585 'TD': '154.68.128.0/19',
4586 'TG': '196.168.0.0/14',
4587 'TH': '171.96.0.0/13',
4588 'TJ': '85.9.128.0/18',
4589 'TK': '27.96.24.0/21',
4590 'TL': '180.189.160.0/20',
4591 'TM': '95.85.96.0/19',
4592 'TN': '197.0.0.0/11',
4593 'TO': '175.176.144.0/21',
4594 'TR': '78.160.0.0/11',
4595 'TT': '186.44.0.0/15',
4596 'TV': '202.2.96.0/19',
4597 'TW': '120.96.0.0/11',
4598 'TZ': '156.156.0.0/14',
4599 'UA': '37.52.0.0/14',
4600 'UG': '102.80.0.0/13',
4601 'US': '6.0.0.0/8',
4602 'UY': '167.56.0.0/13',
4603 'UZ': '84.54.64.0/18',
4604 'VA': '212.77.0.0/19',
4605 'VC': '207.191.240.0/21',
4606 'VE': '186.88.0.0/13',
4607 'VG': '66.81.192.0/20',
4608 'VI': '146.226.0.0/16',
4609 'VN': '14.160.0.0/11',
4610 'VU': '202.80.32.0/20',
4611 'WF': '117.20.32.0/21',
4612 'WS': '202.4.32.0/19',
4613 'YE': '134.35.0.0/16',
4614 'YT': '41.242.116.0/22',
4615 'ZA': '41.0.0.0/11',
4616 'ZM': '102.144.0.0/13',
4617 'ZW': '102.177.192.0/18',
4618 }
4619
4620 @classmethod
4621 def random_ipv4(cls, code_or_block):
4622 if len(code_or_block) == 2:
4623 block = cls._country_ip_map.get(code_or_block.upper())
4624 if not block:
4625 return None
4626 else:
4627 block = code_or_block
4628 addr, preflen = block.split('/')
4629 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4630 addr_max = addr_min | (0xffffffff >> int(preflen))
4631 return compat_str(socket.inet_ntoa(
4632 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
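
# Examples (editor's note): used to fake a plausible source address for geo-bypass.
#   GeoUtils.random_ipv4('DE')             # random address from Germany's major block
#   GeoUtils.random_ipv4('91.152.0.0/13')  # random address from an explicit CIDR block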
4633
4634
4635 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4636 def __init__(self, proxies=None):
4637 # Set default handlers
4638 for type in ('http', 'https'):
4639 setattr(self, '%s_open' % type,
4640 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4641 meth(r, proxy, type))
4642 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4643
4644 def proxy_open(self, req, proxy, type):
4645 req_proxy = req.headers.get('Ytdl-request-proxy')
4646 if req_proxy is not None:
4647 proxy = req_proxy
4648 del req.headers['Ytdl-request-proxy']
4649
4650 if proxy == '__noproxy__':
4651 return None # No Proxy
4652 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4653 req.add_header('Ytdl-socks-proxy', proxy)
# yt-dlp's http/https handlers do the actual wrapping of the socket with SOCKS
4655 return None
4656 return compat_urllib_request.ProxyHandler.proxy_open(
4657 self, req, proxy, type)
4658
4659
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which has
# been released into the public domain
4662 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4663
4664 def long_to_bytes(n, blocksize=0):
4665 """long_to_bytes(n:long, blocksize:int) : string
4666 Convert a long integer to a byte string.
4667
4668 If optional blocksize is given and greater than zero, pad the front of the
4669 byte string with binary zeros so that the length is a multiple of
4670 blocksize.
4671 """
4672 # after much testing, this algorithm was deemed to be the fastest
4673 s = b''
4674 n = int(n)
4675 while n > 0:
4676 s = compat_struct_pack('>I', n & 0xffffffff) + s
4677 n = n >> 32
4678 # strip off leading zeros
4679 for i in range(len(s)):
4680 if s[i] != b'\000'[0]:
4681 break
4682 else:
4683 # only happens when n == 0
4684 s = b'\000'
4685 i = 0
4686 s = s[i:]
4687 # add back some pad bytes. this could be done more efficiently w.r.t. the
4688 # de-padding being done above, but sigh...
4689 if blocksize > 0 and len(s) % blocksize:
4690 s = (blocksize - len(s) % blocksize) * b'\000' + s
4691 return s
4692
4693
4694 def bytes_to_long(s):
4695 """bytes_to_long(string) : long
4696 Convert a byte string to a long integer.
4697
4698 This is (essentially) the inverse of long_to_bytes().
4699 """
4700 acc = 0
4701 length = len(s)
4702 if length % 4:
4703 extra = (4 - length % 4)
4704 s = b'\000' * extra + s
4705 length = length + extra
4706 for i in range(0, length, 4):
4707 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4708 return acc
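
# Round-trip examples (editor's note):
#   long_to_bytes(65537)               # -> b'\x01\x00\x01'
#   long_to_bytes(65537, blocksize=4)  # -> b'\x00\x01\x00\x01' (front-padded)
#   bytes_to_long(b'\x01\x00\x01')     # -> 65537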
4709
4710
4711 def ohdave_rsa_encrypt(data, exponent, modulus):
4712 '''
4713 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4714
4715 Input:
4716 data: data to encrypt, bytes-like object
4717 exponent, modulus: parameter e and N of RSA algorithm, both integer
4718 Output: hex string of encrypted data
4719
4720 Limitation: supports one block encryption only
4721 '''
4722
4723 payload = int(binascii.hexlify(data[::-1]), 16)
4724 encrypted = pow(payload, exponent, modulus)
4725 return '%x' % encrypted
4726
4727
4728 def pkcs1pad(data, length):
4729 """
4730 Padding input data with PKCS#1 scheme
4731
4732 @param {int[]} data input data
4733 @param {int} length target length
4734 @returns {int[]} padded data
4735 """
4736 if len(data) > length - 11:
4737 raise ValueError('Input data too long for PKCS#1 padding')
4738
pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 v1.5 requires non-zero padding bytes
4740 return [0, 2] + pseudo_random + [0] + data
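
# Example (editor's note): pkcs1pad([0x02, 0x03], 16) returns a 16-entry list shaped
# [0, 2, <11 random non-zero bytes>, 0, 0x02, 0x03] (EME-PKCS1-v1_5, block type 2).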
4741
4742
4743 def encode_base_n(num, n, table=None):
4744 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4745 if not table:
4746 table = FULL_TABLE[:n]
4747
4748 if n > len(table):
4749 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4750
4751 if num == 0:
4752 return table[0]
4753
4754 ret = ''
4755 while num:
4756 ret = table[num % n] + ret
4757 num = num // n
4758 return ret
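
# Examples (editor's note):
#   encode_base_n(255, 16)  # -> 'ff'
#   encode_base_n(5, 2)     # -> '101'
#   encode_base_n(0, 36)    # -> '0'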
4759
4760
4761 def decode_packed_codes(code):
4762 mobj = re.search(PACKED_CODES_RE, code)
4763 obfuscated_code, base, count, symbols = mobj.groups()
4764 base = int(base)
4765 count = int(count)
4766 symbols = symbols.split('|')
4767 symbol_table = {}
4768
4769 while count:
4770 count -= 1
4771 base_n_count = encode_base_n(count, base)
4772 symbol_table[base_n_count] = symbols[count] or base_n_count
4773
4774 return re.sub(
4775 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4776 obfuscated_code)
4777
4778
4779 def caesar(s, alphabet, shift):
4780 if shift == 0:
4781 return s
4782 l = len(alphabet)
4783 return ''.join(
4784 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4785 for c in s)
4786
4787
4788 def rot47(s):
4789 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
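
# Editor's note: rot47 is a caesar shift of 47 over the 94 printable ASCII characters,
# so applying it twice restores the input:
#   rot47(rot47('yt-dlp')) == 'yt-dlp'  # True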
4790
4791
4792 def parse_m3u8_attributes(attrib):
4793 info = {}
4794 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4795 if val.startswith('"'):
4796 val = val[1:-1]
4797 info[key] = val
4798 return info
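
# Example (editor's note):
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#   # -> {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}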
4799
4800
4801 def urshift(val, n):
4802 return val >> n if val >= 0 else (val + 0x100000000) >> n
4803
4804
4805 # Based on png2str() written by @gdkchan and improved by @yokrysty
4806 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4807 def decode_png(png_data):
4808 # Reference: https://www.w3.org/TR/PNG/
4809 header = png_data[8:]
4810
4811 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4812 raise OSError('Not a valid PNG file.')
4813
4814 int_map = {1: '>B', 2: '>H', 4: '>I'}
4815 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4816
4817 chunks = []
4818
4819 while header:
4820 length = unpack_integer(header[:4])
4821 header = header[4:]
4822
4823 chunk_type = header[:4]
4824 header = header[4:]
4825
4826 chunk_data = header[:length]
4827 header = header[length:]
4828
4829 header = header[4:] # Skip CRC
4830
4831 chunks.append({
4832 'type': chunk_type,
4833 'length': length,
4834 'data': chunk_data
4835 })
4836
4837 ihdr = chunks[0]['data']
4838
4839 width = unpack_integer(ihdr[:4])
4840 height = unpack_integer(ihdr[4:8])
4841
4842 idat = b''
4843
4844 for chunk in chunks:
4845 if chunk['type'] == b'IDAT':
4846 idat += chunk['data']
4847
4848 if not idat:
4849 raise OSError('Unable to read PNG data.')
4850
4851 decompressed_data = bytearray(zlib.decompress(idat))
4852
4853 stride = width * 3
4854 pixels = []
4855
4856 def _get_pixel(idx):
4857 x = idx % stride
4858 y = idx // stride
4859 return pixels[y][x]
4860
4861 for y in range(height):
4862 basePos = y * (1 + stride)
4863 filter_type = decompressed_data[basePos]
4864
4865 current_row = []
4866
4867 pixels.append(current_row)
4868
4869 for x in range(stride):
4870 color = decompressed_data[1 + basePos + x]
4871 basex = y * stride + x
4872 left = 0
4873 up = 0
4874
4875 if x > 2:
4876 left = _get_pixel(basex - 3)
4877 if y > 0:
4878 up = _get_pixel(basex - stride)
4879
4880 if filter_type == 1: # Sub
4881 color = (color + left) & 0xff
4882 elif filter_type == 2: # Up
4883 color = (color + up) & 0xff
4884 elif filter_type == 3: # Average
4885 color = (color + ((left + up) >> 1)) & 0xff
4886 elif filter_type == 4: # Paeth
4887 a = left
4888 b = up
4889 c = 0
4890
4891 if x > 2 and y > 0:
4892 c = _get_pixel(basex - stride - 3)
4893
4894 p = a + b - c
4895
4896 pa = abs(p - a)
4897 pb = abs(p - b)
4898 pc = abs(p - c)
4899
4900 if pa <= pb and pa <= pc:
4901 color = (color + a) & 0xff
4902 elif pb <= pc:
4903 color = (color + b) & 0xff
4904 else:
4905 color = (color + c) & 0xff
4906
4907 current_row.append(color)
4908
4909 return width, height, pixels
4910
4911
4912 def write_xattr(path, key, value):
4913 # Windows: Write xattrs to NTFS Alternate Data Streams:
4914 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4915 if compat_os_name == 'nt':
4916 assert ':' not in key
4917 assert os.path.exists(path)
4918
4919 try:
4920 with open(f'{path}:{key}', 'wb') as f:
4921 f.write(value)
4922 except OSError as e:
4923 raise XAttrMetadataError(e.errno, e.strerror)
4924 return
4925
4926 # UNIX Method 1. Use xattrs/pyxattrs modules
4927 from .dependencies import xattr
4928
4929 setxattr = None
4930 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4931 # Unicode arguments are not supported in pyxattr until version 0.5.0
4932 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4933 if version_tuple(xattr.__version__) >= (0, 5, 0):
4934 setxattr = xattr.set
4935 elif xattr:
4936 setxattr = xattr.setxattr
4937
4938 if setxattr:
4939 try:
4940 setxattr(path, key, value)
4941 except OSError as e:
4942 raise XAttrMetadataError(e.errno, e.strerror)
4943 return
4944
4945 # UNIX Method 2. Use setfattr/xattr executables
4946 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4947 else 'xattr' if check_executable('xattr', ['-h']) else None)
4948 if not exe:
4949 raise XAttrUnavailableError(
4950 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4951 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
4952
4953 value = value.decode()
4954 try:
4955 _, stderr, returncode = Popen.run(
4956 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4957 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4958 except OSError as e:
4959 raise XAttrMetadataError(e.errno, e.strerror)
4960 if returncode:
4961 raise XAttrMetadataError(returncode, stderr)
4962
4963
4964 def random_birthday(year_field, month_field, day_field):
4965 start_date = datetime.date(1950, 1, 1)
4966 end_date = datetime.date(1995, 12, 31)
4967 offset = random.randint(0, (end_date - start_date).days)
4968 random_date = start_date + datetime.timedelta(offset)
4969 return {
4970 year_field: str(random_date.year),
4971 month_field: str(random_date.month),
4972 day_field: str(random_date.day),
4973 }
4974
4975
4976 # Templates for internet shortcut files, which are plain text files.
4977 DOT_URL_LINK_TEMPLATE = '''\
4978 [InternetShortcut]
4979 URL=%(url)s
4980 '''
4981
4982 DOT_WEBLOC_LINK_TEMPLATE = '''\
4983 <?xml version="1.0" encoding="UTF-8"?>
4984 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4985 <plist version="1.0">
4986 <dict>
4987 \t<key>URL</key>
4988 \t<string>%(url)s</string>
4989 </dict>
4990 </plist>
4991 '''
4992
4993 DOT_DESKTOP_LINK_TEMPLATE = '''\
4994 [Desktop Entry]
4995 Encoding=UTF-8
4996 Name=%(filename)s
4997 Type=Link
4998 URL=%(url)s
4999 Icon=text-html
5000 '''
5001
5002 LINK_TEMPLATES = {
5003 'url': DOT_URL_LINK_TEMPLATE,
5004 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5005 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5006 }
5007
5008
5009 def iri_to_uri(iri):
5010 """
5011 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5012
The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding, other than those already escaped, leaving the URI intact.
5014 """
5015
5016 iri_parts = compat_urllib_parse_urlparse(iri)
5017
5018 if '[' in iri_parts.netloc:
raise ValueError('IPv6 URIs are not yet supported.')
# Querying `.netloc` when there's only one bracket also raises a ValueError.
5021
# The `safe` argument values used by the code below list the characters that must not be percent-encoded. Everything else except letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding; anything already percent-encoded is left as is.
5023
5024 net_location = ''
5025 if iri_parts.username:
5026 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5027 if iri_parts.password is not None:
5028 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5029 net_location += '@'
5030
5031 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5032 # The 'idna' encoding produces ASCII text.
if iri_parts.port is not None and iri_parts.port != {'http': 80, 'https': 443}.get(iri_parts.scheme):  # keep the port unless it is the scheme's default
5034 net_location += ':' + str(iri_parts.port)
5035
5036 return urllib.parse.urlunparse(
5037 (iri_parts.scheme,
5038 net_location,
5039
5040 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5041
5042 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5043 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5044
5045 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5046 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5047
5048 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5049
5050 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
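
# Example (editor's note):
#   iri_to_uri('https://example.com/ünïcode')  # -> 'https://example.com/%C3%BCn%C3%AFcode'
# Already-escaped sequences and the (IDNA-encoded) hostname are left intact.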
5051
5052
5053 def to_high_limit_path(path):
5054 if sys.platform in ['win32', 'cygwin']:
5055 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5056 return '\\\\?\\' + os.path.abspath(path)
5057
5058 return path
5059
5060
5061 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=None):
5062 val = traverse_obj(obj, *variadic(field))
5063 if (not val and val != 0) if ignore is NO_DEFAULT else val in ignore:
5064 return default
5065 return template % (func(val) if func else val)
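
# Examples (editor's note):
#   format_field({'width': 1920}, 'width', '%dpx')        # -> '1920px'
#   format_field({}, 'width', '%dpx', default='unknown')  # -> 'unknown'
#   format_field({'fps': 0}, 'fps', '%d fps')             # -> '0 fps' (0 is not ignored by default)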
5066
5067
5068 def clean_podcast_url(url):
5069 return re.sub(r'''(?x)
5070 (?:
5071 (?:
5072 chtbl\.com/track|
5073 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5074 play\.podtrac\.com
5075 )/[^/]+|
5076 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5077 flex\.acast\.com|
5078 pd(?:
5079 cn\.co| # https://podcorn.com/analytics-prefix/
5080 st\.fm # https://podsights.com/docs/
5081 )/e
5082 )/''', '', url)
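
# Example (editor's note): strips known tracking/analytics prefixes, e.g.
#   clean_podcast_url('https://chtbl.com/track/12345/traffic.example.com/ep.mp3')
#   # -> 'https://traffic.example.com/ep.mp3'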
5083
5084
5085 _HEX_TABLE = '0123456789abcdef'
5086
5087
5088 def random_uuidv4():
# 'x' is a random hex digit; 'y' must encode the RFC 4122 variant (one of 8, 9, a, b)
return re.sub(r'[xy]', lambda m: _HEX_TABLE[random.randint(0, 15)] if m.group(0) == 'x' else random.choice('89ab'), 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5090
5091
5092 def make_dir(path, to_screen=None):
5093 try:
5094 dn = os.path.dirname(path)
5095 if dn and not os.path.exists(dn):
5096 os.makedirs(dn)
5097 return True
5098 except OSError as err:
if callable(to_screen):  # callable() returns a bool, never None
5100 to_screen('unable to create directory ' + error_to_compat_str(err))
5101 return False
5102
5103
5104 def get_executable_path():
5105 from .update import _get_variant_and_executable_path
5106
5107 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5108
5109
5110 def load_plugins(name, suffix, namespace):
5111 classes = {}
5112 with contextlib.suppress(FileNotFoundError):
5113 plugins_spec = importlib.util.spec_from_file_location(
5114 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5115 plugins = importlib.util.module_from_spec(plugins_spec)
5116 sys.modules[plugins_spec.name] = plugins
5117 plugins_spec.loader.exec_module(plugins)
for plugin_name in dir(plugins):  # avoid shadowing the 'name' parameter above
if plugin_name in namespace:
continue
if not plugin_name.endswith(suffix):
continue
klass = getattr(plugins, plugin_name)
classes[plugin_name] = namespace[plugin_name] = klass
5125 return classes
5126
5127
5128 def traverse_obj(
5129 obj, *path_list, default=None, expected_type=None, get_all=True,
5130 casesense=True, is_user_input=False, traverse_string=False):
5131 ''' Traverse nested list/dict/tuple
5132 @param path_list A list of paths which are checked one by one.
5133 Each path is a list of keys where each key is a:
5134 - None: Do nothing
5135 - string: A dictionary key
5136 - int: An index into a list
5137 - tuple: A list of keys all of which will be traversed
5138 - Ellipsis: Fetch all values in the object
5139 - Function: Takes the key and value as arguments
5140 and returns whether the key matches or not
5141 @param default Default value to return
5142 @param expected_type Only accept final value of this type (Can also be any callable)
5143 @param get_all Return all the values obtained from a path or only the first one
5144 @param casesense Whether to consider dictionary keys as case sensitive
5145 @param is_user_input Whether the keys are generated from user input. If True,
5146 strings are converted to int/slice if necessary
5147 @param traverse_string Whether to traverse inside strings. If True, any
5148 non-compatible object will also be converted into a string
5149 # TODO: Write tests
5150 '''
5151 if not casesense:
5152 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5153 path_list = (map(_lower, variadic(path)) for path in path_list)
5154
5155 def _traverse_obj(obj, path, _current_depth=0):
5156 nonlocal depth
5157 path = tuple(variadic(path))
5158 for i, key in enumerate(path):
5159 if None in (key, obj):
5160 return obj
5161 if isinstance(key, (list, tuple)):
5162 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5163 key = ...
5164 if key is ...:
5165 obj = (obj.values() if isinstance(obj, dict)
5166 else obj if isinstance(obj, (list, tuple, LazyList))
5167 else str(obj) if traverse_string else [])
5168 _current_depth += 1
5169 depth = max(depth, _current_depth)
5170 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5171 elif callable(key):
5172 if isinstance(obj, (list, tuple, LazyList)):
5173 obj = enumerate(obj)
5174 elif isinstance(obj, dict):
5175 obj = obj.items()
5176 else:
5177 if not traverse_string:
5178 return None
5179 obj = str(obj)
5180 _current_depth += 1
5181 depth = max(depth, _current_depth)
5182 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5183 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5184 obj = (obj.get(key) if casesense or (key in obj)
5185 else next((v for k, v in obj.items() if _lower(k) == key), None))
5186 else:
5187 if is_user_input:
5188 key = (int_or_none(key) if ':' not in key
5189 else slice(*map(int_or_none, key.split(':'))))
5190 if key == slice(None):
5191 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5192 if not isinstance(key, (int, slice)):
5193 return None
5194 if not isinstance(obj, (list, tuple, LazyList)):
5195 if not traverse_string:
5196 return None
5197 obj = str(obj)
5198 try:
5199 obj = obj[key]
5200 except IndexError:
5201 return None
5202 return obj
5203
5204 if isinstance(expected_type, type):
5205 type_test = lambda val: val if isinstance(val, expected_type) else None
5206 elif expected_type is not None:
5207 type_test = expected_type
5208 else:
5209 type_test = lambda val: val
5210
5211 for path in path_list:
5212 depth = 0
5213 val = _traverse_obj(obj, path)
5214 if val is not None:
5215 if depth:
5216 for _ in range(depth - 1):
5217 val = itertools.chain.from_iterable(v for v in val if v is not None)
5218 val = [v for v in map(type_test, val) if v is not None]
5219 if val:
5220 return val if get_all else val[0]
5221 else:
5222 val = type_test(val)
5223 if val is not None:
5224 return val
5225 return default
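
# Examples (editor's illustrative sketch):
#   d = {'formats': [{'url': 'u1'}, {'height': 720, 'url': 'u2'}]}
#   traverse_obj(d, ('formats', 0, 'url'))       # -> 'u1'
#   traverse_obj(d, ('formats', ..., 'height'))  # -> [720] (Ellipsis fans out; Nones are dropped)
#   traverse_obj(d, ('formats', 1, 'width'), default=-1)  # -> -1
#   get_first(d['formats'], 'url')               # -> 'u1' (see get_first below)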
5226
5227
5228 def traverse_dict(dictn, keys, casesense=True):
5229 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5230 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5231 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5232
5233
5234 def get_first(obj, keys, **kwargs):
5235 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5236
5237
5238 def variadic(x, allowed_types=(str, bytes, dict)):
5239 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5240
5241
def decode_base(value, digits):
# Convert the given base-x string to an integer
table = {char: index for index, char in enumerate(digits)}
result = 0
base = len(digits)
for char in value:  # 'char', not 'chr', to avoid shadowing the builtin
result *= base
result += table[char]
return result
5251
5252
5253 def time_seconds(**kwargs):
5254 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5255 return t.timestamp()
5256
5257
# create a JSON Web Signature (JWS) with the HS256 algorithm
# the result is in JWS Compact Serialization format
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5262 def jwt_encode_hs256(payload_data, key, headers={}):
5263 header_data = {
5264 'alg': 'HS256',
5265 'typ': 'JWT',
5266 }
5267 if headers:
5268 header_data.update(headers)
5269 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5270 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5271 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5272 signature_b64 = base64.b64encode(h.digest())
5273 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5274 return token
5275
5276
# can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5278 def jwt_decode_hs256(jwt):
5279 header_b64, payload_b64, signature_b64 = jwt.split('.')
5280 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5281 return payload_data
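
# Round-trip example (editor's note; jwt_decode_hs256 does NOT verify the signature):
#   token = jwt_encode_hs256({'iss': 'yt-dlp'}, 'secret-key')
#   jwt_decode_hs256(token.decode())  # -> {'iss': 'yt-dlp'}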
5282
5283
5284 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5285
5286
5287 @functools.cache
5288 def supports_terminal_sequences(stream):
5289 if compat_os_name == 'nt':
5290 if not WINDOWS_VT_MODE:
5291 return False
5292 elif not os.getenv('TERM'):
5293 return False
5294 try:
5295 return stream.isatty()
5296 except BaseException:
5297 return False
5298
5299
5300 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5301 if get_windows_version() < (10, 0, 10586):
5302 return
5303 global WINDOWS_VT_MODE
5304 try:
5305 Popen.run('', shell=True)
5306 except Exception:
5307 return
5308
5309 WINDOWS_VT_MODE = True
5310 supports_terminal_sequences.cache_clear()
5311
5312
5313 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5314
5315
5316 def remove_terminal_sequences(string):
5317 return _terminal_sequences_re.sub('', string)
5318
5319
5320 def number_of_digits(number):
5321 return len('%d' % number)
5322
5323
5324 def join_nonempty(*values, delim='-', from_dict=None):
5325 if from_dict is not None:
5326 values = map(from_dict.get, values)
5327 return delim.join(map(str, filter(None, values)))
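
# Examples (editor's note):
#   join_nonempty('a', None, '', 'b')  # -> 'a-b'
#   join_nonempty('width', 'height', from_dict={'width': 1920, 'height': 1080}, delim='x')
#   # -> '1920x1080'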
5328
5329
5330 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5331 """
5332 Find the largest format dimensions in terms of video width and, for each thumbnail:
5333 * Modify the URL: Match the width with the provided regex and replace with the former width
5334 * Update dimensions
5335
5336 This function is useful with video services that scale the provided thumbnails on demand
5337 """
5338 _keys = ('width', 'height')
5339 max_dimensions = max(
5340 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5341 default=(0, 0))
5342 if not max_dimensions[0]:
5343 return thumbnails
5344 return [
5345 merge_dicts(
5346 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5347 dict(zip(_keys, max_dimensions)), thumbnail)
5348 for thumbnail in thumbnails
5349 ]
5350
5351
5352 def parse_http_range(range):
5353 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5354 if not range:
5355 return None, None, None
5356 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5357 if not crg:
5358 return None, None, None
5359 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
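
# Examples (editor's note):
#   parse_http_range('bytes=0-499')         # -> (0, 499, None)
#   parse_http_range('bytes 500-999/8000')  # -> (500, 999, 8000)
#   parse_http_range(None)                  # -> (None, None, None)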
5360
5361
5362 def read_stdin(what):
5363 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5364 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5365 return sys.stdin
5366
5367
5368 class Config:
5369 own_args = None
5370 parsed_args = None
5371 filename = None
5372 __initialized = False
5373
5374 def __init__(self, parser, label=None):
5375 self.parser, self.label = parser, label
5376 self._loaded_paths, self.configs = set(), []
5377
5378 def init(self, args=None, filename=None):
5379 assert not self.__initialized
5380 directory = ''
5381 if filename:
5382 location = os.path.realpath(filename)
5383 directory = os.path.dirname(location)
5384 if location in self._loaded_paths:
5385 return False
5386 self._loaded_paths.add(location)
5387
5388 self.own_args, self.__initialized = args, True
5389 opts, _ = self.parser.parse_known_args(args)
5390 self.parsed_args, self.filename = args, filename
5391
5392 for location in opts.config_locations or []:
5393 if location == '-':
5394 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5395 continue
5396 location = os.path.join(directory, expand_path(location))
5397 if os.path.isdir(location):
5398 location = os.path.join(location, 'yt-dlp.conf')
5399 if not os.path.exists(location):
5400 self.parser.error(f'config location {location} does not exist')
5401 self.append_config(self.read_file(location), location)
5402 return True
5403
5404 def __str__(self):
5405 label = join_nonempty(
5406 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5407 delim=' ')
5408 return join_nonempty(
5409 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5410 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5411 delim='\n')
5412
5413 @staticmethod
5414 def read_file(filename, default=[]):
5415 try:
5416 optionf = open(filename)
5417 except OSError:
5418 return default # silently skip if file is not present
5419 try:
5420 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5421 contents = optionf.read()
5422 res = shlex.split(contents, comments=True)
5423 finally:
5424 optionf.close()
5425 return res
5426
5427 @staticmethod
5428 def hide_login_info(opts):
5429 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5430 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5431
5432 def _scrub_eq(o):
5433 m = eqre.match(o)
5434 if m:
5435 return m.group('key') + '=PRIVATE'
5436 else:
5437 return o
5438
5439 opts = list(map(_scrub_eq, opts))
5440 for idx, opt in enumerate(opts):
5441 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5442 opts[idx + 1] = 'PRIVATE'
5443 return opts
5444
5445 def append_config(self, *args, label=None):
5446 config = type(self)(self.parser, label)
5447 config._loaded_paths = self._loaded_paths
5448 if config.init(*args):
5449 self.configs.append(config)
5450
5451 @property
5452 def all_args(self):
5453 for config in reversed(self.configs):
5454 yield from config.all_args
5455 yield from self.parsed_args or []
5456
5457 def parse_known_args(self, **kwargs):
5458 return self.parser.parse_known_args(self.all_args, **kwargs)
5459
5460 def parse_args(self):
5461 return self.parser.parse_args(self.all_args)
5462
5463
class WebSocketsWrapper:
5465 """Wraps websockets module to use in non-async scopes"""
5466 pool = None
5467
5468 def __init__(self, url, headers=None, connect=True):
5469 self.loop = asyncio.new_event_loop()
5470 # XXX: "loop" is deprecated
5471 self.conn = websockets.connect(
5472 url, extra_headers=headers, ping_interval=None,
5473 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5474 if connect:
5475 self.__enter__()
5476 atexit.register(self.__exit__, None, None, None)
5477
5478 def __enter__(self):
5479 if not self.pool:
5480 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5481 return self
5482
5483 def send(self, *args):
5484 self.run_with_loop(self.pool.send(*args), self.loop)
5485
5486 def recv(self, *args):
5487 return self.run_with_loop(self.pool.recv(*args), self.loop)
5488
5489 def __exit__(self, type, value, traceback):
5490 try:
5491 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5492 finally:
self._cancel_all_tasks(self.loop)  # cancel pending tasks before the loop is closed
self.loop.close()
5495
5496 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
# for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
5498 @staticmethod
5499 def run_with_loop(main, loop):
5500 if not asyncio.iscoroutine(main):
5501 raise ValueError(f'a coroutine was expected, got {main!r}')
5502
5503 try:
5504 return loop.run_until_complete(main)
5505 finally:
5506 loop.run_until_complete(loop.shutdown_asyncgens())
5507 if hasattr(loop, 'shutdown_default_executor'):
5508 loop.run_until_complete(loop.shutdown_default_executor())
5509
5510 @staticmethod
5511 def _cancel_all_tasks(loop):
5512 to_cancel = asyncio.all_tasks(loop)
5513
5514 if not to_cancel:
5515 return
5516
5517 for task in to_cancel:
5518 task.cancel()
5519
5520 # XXX: "loop" is removed in python 3.10+
5521 loop.run_until_complete(
5522 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5523
5524 for task in to_cancel:
5525 if task.cancelled():
5526 continue
5527 if task.exception() is not None:
5528 loop.call_exception_handler({
5529 'message': 'unhandled exception during asyncio.run() shutdown',
5530 'exception': task.exception(),
5531 'task': task,
5532 })
5533
5534
5535 def merge_headers(*dicts):
5536 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5537 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
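
# Example (editor's note):
#   merge_headers({'user-agent': 'UA1', 'Accept': '*/*'}, {'User-Agent': 'UA2'})
#   # -> {'User-Agent': 'UA2', 'Accept': '*/*'}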
5538
5539
5540 class classproperty:
5541 """classmethod(property(func)) that works in py < 3.9"""
5542
5543 def __init__(self, func):
5544 functools.update_wrapper(self, func)
5545 self.func = func
5546
5547 def __get__(self, _, cls):
5548 return self.func(cls)
5549
5550
5551 class Namespace(types.SimpleNamespace):
5552 """Immutable namespace"""
5553
5554 def __iter__(self):
5555 return iter(self.__dict__.values())
5556
5557 @property
5558 def items_(self):
5559 return self.__dict__.items()
5560
5561
5562 # Deprecated
5563 has_certifi = bool(certifi)
5564 has_websockets = bool(websockets)