import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
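
# Illustrative usage (a sketch; the Chrome version is picked at random on each call):
#   >>> random_user_agent()
#   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.50 Safari/537.36'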


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
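
# Illustrative usage (a sketch; 'info.json' is a hypothetical path):
#   >>> write_json_file({'id': 'abc', 'title': 'Example'}, 'info.json')
# The data is written to a temporary file in the target directory first and
# then renamed over the destination, so readers never see a half-written file.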


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
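
# Illustrative usage (a sketch with a made-up namespace map):
#   >>> xpath_with_ns('ns:body/ns:div', {'ns': 'http://example.com/ns'})
#   '{http://example.com/ns}body/{http://example.com/ns}div'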


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
482 """
483 def find_or_raise(haystack, needle, exc):
484 try:
485 return haystack.index(needle)
486 except ValueError:
487 raise exc
488 closing_tag = f'</{tag}>'
489 whole_start = find_or_raise(
490 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
491 content_start = find_or_raise(
492 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
493 content_start += whole_start + 1
494 with HTMLBreakOnClosingTagParser() as parser:
495 parser.feed(html[whole_start:content_start])
496 if not parser.tagstack or parser.tagstack[0] != tag:
497 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
498 offset = content_start
499 while offset < len(html):
500 next_closing_tag_start = find_or_raise(
501 html[offset:], closing_tag,
502 compat_HTMLParseError(f'closing {tag} tag not found'))
503 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
504 try:
505 parser.feed(html[offset:offset + next_closing_tag_end])
506 offset += next_closing_tag_end
507 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
508 return html[content_start:offset + next_closing_tag_start], \
509 html[whole_start:offset + next_closing_tag_end]
510 raise compat_HTMLParseError('unexpected end of html')
511
512
513 class HTMLAttributeParser(html.parser.HTMLParser):
514 """Trivial HTML parser to gather the attributes for a single element"""
515
516 def __init__(self):
517 self.attrs = {}
518 html.parser.HTMLParser.__init__(self)
519
520 def handle_starttag(self, tag, attrs):
521 self.attrs = dict(attrs)
522
523
524 class HTMLListAttrsParser(html.parser.HTMLParser):
525 """HTML parser to gather the attributes for the elements of a list"""
526
527 def __init__(self):
528 html.parser.HTMLParser.__init__(self)
529 self.items = []
530 self._level = 0
531
532 def handle_starttag(self, tag, attrs):
533 if tag == 'li' and self._level == 0:
534 self.items.append(dict(attrs))
535 self._level += 1
536
537 def handle_endtag(self, tag):
538 self._level -= 1
539
540
541 def extract_attributes(html_element):
542 """Given a string for an HTML element such as
543 <el
544 a="foo" B="bar" c="&98;az" d=boz
545 empty= noval entity="&amp;"
546 sq='"' dq="'"
547 >
548 Decode and return a dictionary of attributes.
549 {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
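
# Illustrative usage (a sketch of the expected decoding):
#   >>> extract_attributes('<a href="page.html" data-id=5 hidden>')
#   {'href': 'page.html', 'data-id': '5', 'hidden': None}
# Attribute names are lowercased by the underlying HTMLParser, and entity
# references in attribute values are decoded.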


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
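
# Illustrative usage (a sketch):
#   >>> clean_html('<p>Some  <b>bold</b> text<br>and a&amp;b</p>')
#   'Some bold text\nand a&b'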


class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        try:
            if self.ignore_extra:
                return self.raw_decode(s.lstrip())[0]
            return super().decode(s)
        except json.JSONDecodeError as e:
            if e.pos is not None:
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
            raise
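
# Illustrative usage (a sketch): tolerate trailing garbage after the JSON value
#   >>> json.loads('{"a": 1}; trailing', cls=LenientJSONDecoder, ignore_extra=True)
#   {'a': 1}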


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
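
# Illustrative usage (a sketch):
#   >>> timeconvert('Thu, 01 Jan 1970 00:00:00 +0000')
#   0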


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    if restricted and is_id is NO_DEFAULT:
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
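
# Illustrative usage (a sketch; traced by hand through the rules above):
#   >>> sanitize_filename('Artist: Song?', restricted=True)
#   'Artist_-_Song'
# The colon becomes '_-', the space becomes '_', and '?' is dropped.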


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
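
# Illustrative usage (a sketch; a no-op except on Windows, unless force=True):
#   >>> sanitize_path('C:\\foo:bar|baz')  # on win32
#   'C:\\foo#bar#baz'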


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
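
# Illustrative usage (a sketch):
#   >>> sanitize_url('//example.com/watch')
#   'http://example.com/watch'
#   >>> sanitize_url('httpss://example.com')
#   'https://example.com'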


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
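
# Illustrative usage (a sketch; the credentials here are made up):
#   >>> extract_basic_auth('http://user:pass@example.com/feed')
#   ('http://example.com/feed', 'Basic dXNlcjpwYXNz')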


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
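
# Illustrative usage (a sketch):
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]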


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
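
# Illustrative usage (a sketch):
#   >>> unescapeHTML('a &amp; b &#x26; c')
#   'a & b & c'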


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
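
# Illustrative usage (a sketch):
#   >>> timetuple_from_msec(3725000)
#   Time(hours=1, minutes=2, seconds=5, milliseconds=0)
#   >>> formatSeconds(3725)
#   '1:02:05'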


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if is_path_like(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if is_path_like(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
1759
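# Expected behaviour (illustrative; values follow from the rules above):
#   parse_iso8601('2014-03-23T23:04:26+0100')  # -> 1395612266
#   parse_iso8601('2014-03-23T22:04:26Z')      # -> 1395612266
#   parse_iso8601(None)                        # -> None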
1760
1761 def date_formats(day_first=True):
1762 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1763
1764
1765 def unified_strdate(date_str, day_first=True):
1766 """Return a string with the date in the format YYYYMMDD"""
1767
1768 if date_str is None:
1769 return None
1770 upload_date = None
1771 # Replace commas
1772 date_str = date_str.replace(',', ' ')
1773 # Remove AM/PM + timezone
1774 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1775 _, date_str = extract_timezone(date_str)
1776
1777 for expression in date_formats(day_first):
1778 with contextlib.suppress(ValueError):
1779 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1780 if upload_date is None:
1781 timetuple = email.utils.parsedate_tz(date_str)
1782 if timetuple:
1783 with contextlib.suppress(ValueError):
1784 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1785 if upload_date is not None:
1786 return str(upload_date)
1787
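# Illustrative expected outputs (note day_first=True by default):
#   unified_strdate('December 21, 2010')  # -> '20101221'
#   unified_strdate('8/7/2009')           # -> '20090708'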
1788
1789 def unified_timestamp(date_str, day_first=True):
1790 if date_str is None:
1791 return None
1792
1793 date_str = re.sub(r'\s+', ' ', re.sub(
1794 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1795
1796 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1797 timezone, date_str = extract_timezone(date_str)
1798
1799 # Remove AM/PM + timezone
1800 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1801
1802 # Remove unrecognized timezones from ISO 8601-like timestamps
1803 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1804 if m:
1805 date_str = date_str[:-len(m.group('tz'))]
1806
1807 # Python only supports microseconds, so remove nanoseconds
1808 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1809 if m:
1810 date_str = m.group(1)
1811
1812 for expression in date_formats(day_first):
1813 with contextlib.suppress(ValueError):
1814 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1815 return calendar.timegm(dt.timetuple())
1816
1817 timetuple = email.utils.parsedate_tz(date_str)
1818 if timetuple:
1819 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1820
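# A quick sketch of expected results (UTC epoch seconds; illustrative):
#   unified_timestamp('December 15, 2017 at 7:49 am')  # -> 1513324140
#   unified_timestamp('UNKNOWN DATE FORMAT')           # -> None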
1821
1822 def determine_ext(url, default_ext='unknown_video'):
1823 if url is None or '.' not in url:
1824 return default_ext
1825 guess = url.partition('?')[0].rpartition('.')[2]
1826 if re.match(r'^[A-Za-z0-9]+$', guess):
1827 return guess
1828 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1829 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1830 return guess.rstrip('/')
1831 else:
1832 return default_ext
1833
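# Illustrative examples:
#   determine_ext('http://example.com/foo/bar.mp4/?download')  # -> 'mp4'
#   determine_ext('http://example.com/foo/bar/')               # -> 'unknown_video'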
1834
1835 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1836 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1837
1838
1839 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1840 R"""
1841 Return a datetime object from a string.
1842 Supported format:
1843 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1844
1845 @param format strftime format of DATE
1846 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1847 auto: round to the unit provided in date_str (if applicable).
1848 """
1849 auto_precision = False
1850 if precision == 'auto':
1851 auto_precision = True
1852 precision = 'microsecond'
1853 today = datetime_round(datetime.datetime.utcnow(), precision)
1854 if date_str in ('now', 'today'):
1855 return today
1856 if date_str == 'yesterday':
1857 return today - datetime.timedelta(days=1)
1858 match = re.match(
1859 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1860 date_str)
1861 if match is not None:
1862 start_time = datetime_from_str(match.group('start'), precision, format)
1863 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1864 unit = match.group('unit')
1865 if unit == 'month' or unit == 'year':
1866 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1867 unit = 'day'
1868 else:
1869 if unit == 'week':
1870 unit = 'day'
1871 time *= 7
1872 delta = datetime.timedelta(**{unit + 's': time})
1873 new_date = start_time + delta
1874 if auto_precision:
1875 return datetime_round(new_date, unit)
1876 return new_date
1877
1878 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1879
1880
1881 def date_from_str(date_str, format='%Y%m%d', strict=False):
1882 R"""
1883 Return a date object from a string using datetime_from_str
1884
1885 @param strict Restrict allowed patterns to "YYYYMMDD" and
1886 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1887 """
1888 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1889 raise ValueError(f'Invalid date format "{date_str}"')
1890 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1891
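# Illustrative examples of the relative-date syntax:
#   date_from_str('now-1week')  # -> the date 7 days ago
#   date_from_str('20200229')   # -> datetime.date(2020, 2, 29)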
1892
1893 def datetime_add_months(dt, months):
1894 """Increment/Decrement a datetime object by months."""
1895 month = dt.month + months - 1
1896 year = dt.year + month // 12
1897 month = month % 12 + 1
1898 day = min(dt.day, calendar.monthrange(year, month)[1])
1899 return dt.replace(year, month, day)
1900
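# Note the clamping to the last valid day of the target month, e.g.:
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1)   # -> 2020-02-29
#   datetime_add_months(datetime.datetime(2020, 3, 15), -3)  # -> 2019-12-15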
1901
1902 def datetime_round(dt, precision='day'):
1903 """
1904 Round a datetime object's time to a specific precision
1905 """
1906 if precision == 'microsecond':
1907 return dt
1908
1909 unit_seconds = {
1910 'day': 86400,
1911 'hour': 3600,
1912 'minute': 60,
1913 'second': 1,
1914 }
1915 roundto = lambda x, n: ((x + n / 2) // n) * n
1916 timestamp = calendar.timegm(dt.timetuple())
1917 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1918
1919
1920 def hyphenate_date(date_str):
1921 """
1922 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1923 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1924 if match is not None:
1925 return '-'.join(match.groups())
1926 else:
1927 return date_str
1928
1929
1930 class DateRange:
1931 """Represents a time interval between two dates"""
1932
1933 def __init__(self, start=None, end=None):
1934 """start and end must be strings in the format accepted by date"""
1935 if start is not None:
1936 self.start = date_from_str(start, strict=True)
1937 else:
1938 self.start = datetime.datetime.min.date()
1939 if end is not None:
1940 self.end = date_from_str(end, strict=True)
1941 else:
1942 self.end = datetime.datetime.max.date()
1943 if self.start > self.end:
1944 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
1945
1946 @classmethod
1947 def day(cls, day):
1948 """Returns a range that only contains the given day"""
1949 return cls(day, day)
1950
1951 def __contains__(self, date):
1952 """Check if the date is in the range"""
1953 if not isinstance(date, datetime.date):
1954 date = date_from_str(date)
1955 return self.start <= date <= self.end
1956
1957 def __str__(self):
1958 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1959
1960 def __eq__(self, other):
1961 return (isinstance(other, DateRange)
1962 and self.start == other.start and self.end == other.end)
1963
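# Illustrative usage:
#   '20200115' in DateRange('20200101', '20200131')  # -> True
#   DateRange.day('20200101') == DateRange('20200101', '20200101')  # -> True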
1964
1965 def platform_name():
1966 """ Returns the platform name as a str """
1967 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
1968 return platform.platform()
1969
1970
1971 @functools.cache
1972 def system_identifier():
1973 python_implementation = platform.python_implementation()
1974 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1975 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1976 libc_ver = []
1977 with contextlib.suppress(OSError): # We may not have access to the executable
1978 libc_ver = platform.libc_ver()
1979
1980 return 'Python %s (%s %s) - %s %s' % (
1981 platform.python_version(),
1982 python_implementation,
1983 platform.architecture()[0],
1984 platform.platform(),
1985 format_field(join_nonempty(*libc_ver, delim=' '), None, '(%s)'),
1986 )
1987
1988
1989 @functools.cache
1990 def get_windows_version():
1991 ''' Get the Windows version. Returns () if not running on Windows '''
1992 if compat_os_name == 'nt':
1993 return version_tuple(platform.win32_ver()[1])
1994 else:
1995 return ()
1996
1997
1998 def write_string(s, out=None, encoding=None):
1999 assert isinstance(s, str)
2000 out = out or sys.stderr
2001
2002 if compat_os_name == 'nt' and supports_terminal_sequences(out):
2003 s = re.sub(r'([\r\n]+)', r' \1', s)
2004
2005 enc, buffer = None, out
2006 if 'b' in getattr(out, 'mode', ''):
2007 enc = encoding or preferredencoding()
2008 elif hasattr(out, 'buffer'):
2009 buffer = out.buffer
2010 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2011
2012 buffer.write(s.encode(enc, 'ignore') if enc else s)
2013 out.flush()
2014
2015
2016 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2017 from . import _IN_CLI
2018 if _IN_CLI:
2019 if msg in deprecation_warning._cache:
2020 return
2021 deprecation_warning._cache.add(msg)
2022 if printer:
2023 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2024 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2025 else:
2026 import warnings
2027 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2028
2029
2030 deprecation_warning._cache = set()
2031
2032
2033 def bytes_to_intlist(bs):
2034 if not bs:
2035 return []
2036 if isinstance(bs[0], int): # bytes-like input: indexing yields ints directly
2037 return list(bs)
2038 else:
2039 return [ord(c) for c in bs]
2040
2041
2042 def intlist_to_bytes(xs):
2043 if not xs:
2044 return b''
2045 return struct.pack('%dB' % len(xs), *xs)
2046
2047
2048 class LockingUnsupportedError(OSError):
2049 msg = 'File locking is not supported'
2050
2051 def __init__(self):
2052 super().__init__(self.msg)
2053
2054
2055 # Cross-platform file locking
2056 if sys.platform == 'win32':
2057 import ctypes
2058 import ctypes.wintypes
2059 import msvcrt
2060
2061 class OVERLAPPED(ctypes.Structure):
2062 _fields_ = [
2063 ('Internal', ctypes.wintypes.LPVOID),
2064 ('InternalHigh', ctypes.wintypes.LPVOID),
2065 ('Offset', ctypes.wintypes.DWORD),
2066 ('OffsetHigh', ctypes.wintypes.DWORD),
2067 ('hEvent', ctypes.wintypes.HANDLE),
2068 ]
2069
2070 kernel32 = ctypes.windll.kernel32
2071 LockFileEx = kernel32.LockFileEx
2072 LockFileEx.argtypes = [
2073 ctypes.wintypes.HANDLE, # hFile
2074 ctypes.wintypes.DWORD, # dwFlags
2075 ctypes.wintypes.DWORD, # dwReserved
2076 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2077 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2078 ctypes.POINTER(OVERLAPPED) # Overlapped
2079 ]
2080 LockFileEx.restype = ctypes.wintypes.BOOL
2081 UnlockFileEx = kernel32.UnlockFileEx
2082 UnlockFileEx.argtypes = [
2083 ctypes.wintypes.HANDLE, # hFile
2084 ctypes.wintypes.DWORD, # dwReserved
2085 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2086 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2087 ctypes.POINTER(OVERLAPPED) # Overlapped
2088 ]
2089 UnlockFileEx.restype = ctypes.wintypes.BOOL
2090 whole_low = 0xffffffff
2091 whole_high = 0x7fffffff
2092
2093 def _lock_file(f, exclusive, block):
2094 overlapped = OVERLAPPED()
2095 overlapped.Offset = 0
2096 overlapped.OffsetHigh = 0
2097 overlapped.hEvent = 0
2098 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2099
2100 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2101 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2102 0, whole_low, whole_high, f._lock_file_overlapped_p):
2103 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2104 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2105
2106 def _unlock_file(f):
2107 assert f._lock_file_overlapped_p
2108 handle = msvcrt.get_osfhandle(f.fileno())
2109 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2110 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2111
2112 else:
2113 try:
2114 import fcntl
2115
2116 def _lock_file(f, exclusive, block):
2117 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2118 if not block:
2119 flags |= fcntl.LOCK_NB
2120 try:
2121 fcntl.flock(f, flags)
2122 except BlockingIOError:
2123 raise
2124 except OSError: # AOSP does not have flock()
2125 fcntl.lockf(f, flags)
2126
2127 def _unlock_file(f):
2128 try:
2129 fcntl.flock(f, fcntl.LOCK_UN)
2130 except OSError:
2131 fcntl.lockf(f, fcntl.LOCK_UN)
2132
2133 except ImportError:
2134
2135 def _lock_file(f, exclusive, block):
2136 raise LockingUnsupportedError()
2137
2138 def _unlock_file(f):
2139 raise LockingUnsupportedError()
2140
2141
2142 class locked_file:
2143 locked = False
2144
2145 def __init__(self, filename, mode, block=True, encoding=None):
2146 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2147 raise NotImplementedError(mode)
2148 self.mode, self.block = mode, block
2149
2150 writable = any(f in mode for f in 'wax+')
2151 readable = any(f in mode for f in 'r+')
2152 flags = functools.reduce(operator.ior, (
2153 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2154 getattr(os, 'O_BINARY', 0), # Windows only
2155 getattr(os, 'O_NOINHERIT', 0), # Windows only
2156 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2157 os.O_APPEND if 'a' in mode else 0,
2158 os.O_EXCL if 'x' in mode else 0,
2159 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2160 ))
2161
2162 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2163
2164 def __enter__(self):
2165 exclusive = 'r' not in self.mode
2166 try:
2167 _lock_file(self.f, exclusive, self.block)
2168 self.locked = True
2169 except OSError:
2170 self.f.close()
2171 raise
2172 if 'w' in self.mode:
2173 try:
2174 self.f.truncate()
2175 except OSError as e:
2176 if e.errno not in (
2177 errno.ESPIPE, # Illegal seek - expected for FIFO
2178 errno.EINVAL, # Invalid argument - expected for /dev/null
2179 ):
2180 raise
2181 return self
2182
2183 def unlock(self):
2184 if not self.locked:
2185 return
2186 try:
2187 _unlock_file(self.f)
2188 finally:
2189 self.locked = False
2190
2191 def __exit__(self, *_):
2192 try:
2193 self.unlock()
2194 finally:
2195 self.f.close()
2196
2197 open = __enter__
2198 close = __exit__
2199
2200 def __getattr__(self, attr):
2201 return getattr(self.f, attr)
2202
2203 def __iter__(self):
2204 return iter(self.f)
2205
2206
2207 @functools.cache
2208 def get_filesystem_encoding():
2209 encoding = sys.getfilesystemencoding()
2210 return encoding if encoding is not None else 'utf-8'
2211
2212
2213 def shell_quote(args):
2214 quoted_args = []
2215 encoding = get_filesystem_encoding()
2216 for a in args:
2217 if isinstance(a, bytes):
2218 # We may get a filename encoded with 'encodeFilename'
2219 a = a.decode(encoding)
2220 quoted_args.append(compat_shlex_quote(a))
2221 return ' '.join(quoted_args)
2222
2223
2224 def smuggle_url(url, data):
2225 """ Pass additional data in a URL for internal use. """
2226
2227 url, idata = unsmuggle_url(url, {})
2228 data.update(idata)
2229 sdata = urllib.parse.urlencode(
2230 {'__youtubedl_smuggle': json.dumps(data)})
2231 return url + '#' + sdata
2232
2233
2234 def unsmuggle_url(smug_url, default=None):
2235 if '#__youtubedl_smuggle' not in smug_url:
2236 return smug_url, default
2237 url, _, sdata = smug_url.rpartition('#')
2238 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2239 data = json.loads(jsond)
2240 return url, data
2241
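# Illustrative round trip (the data survives via the URL fragment):
#   url = smuggle_url('http://example.com/video', {'referer': 'http://foo'})
#   unsmuggle_url(url)  # -> ('http://example.com/video', {'referer': 'http://foo'})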
2242
2243 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2244 """ Formats numbers with decimal suffixes like K, M, etc. """
2245 num, factor = float_or_none(num), float(factor)
2246 if num is None or num < 0:
2247 return None
2248 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2249 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2250 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2251 if factor == 1024:
2252 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2253 converted = num / (factor ** exponent)
2254 return fmt % (converted, suffix)
2255
2256
2257 def format_bytes(bytes):
2258 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2259
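# Expected outputs (illustrative):
#   format_decimal_suffix(1_300_000)  # -> '1M' (the default '%d%s' truncates)
#   format_bytes(1024 ** 2)           # -> '1.00MiB'
#   format_bytes(None)                # -> 'N/A'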
2260
2261 def lookup_unit_table(unit_table, s):
2262 units_re = '|'.join(re.escape(u) for u in unit_table)
2263 m = re.match(
2264 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2265 if not m:
2266 return None
2267 num_str = m.group('num').replace(',', '.')
2268 mult = unit_table[m.group('unit')]
2269 return int(float(num_str) * mult)
2270
2271
2272 def parse_filesize(s):
2273 if s is None:
2274 return None
2275
2276 # The lower-case forms are of course incorrect and unofficial,
2277 # but we support those too
2278 _UNIT_TABLE = {
2279 'B': 1,
2280 'b': 1,
2281 'bytes': 1,
2282 'KiB': 1024,
2283 'KB': 1000,
2284 'kB': 1024,
2285 'Kb': 1000,
2286 'kb': 1000,
2287 'kilobytes': 1000,
2288 'kibibytes': 1024,
2289 'MiB': 1024 ** 2,
2290 'MB': 1000 ** 2,
2291 'mB': 1024 ** 2,
2292 'Mb': 1000 ** 2,
2293 'mb': 1000 ** 2,
2294 'megabytes': 1000 ** 2,
2295 'mebibytes': 1024 ** 2,
2296 'GiB': 1024 ** 3,
2297 'GB': 1000 ** 3,
2298 'gB': 1024 ** 3,
2299 'Gb': 1000 ** 3,
2300 'gb': 1000 ** 3,
2301 'gigabytes': 1000 ** 3,
2302 'gibibytes': 1024 ** 3,
2303 'TiB': 1024 ** 4,
2304 'TB': 1000 ** 4,
2305 'tB': 1024 ** 4,
2306 'Tb': 1000 ** 4,
2307 'tb': 1000 ** 4,
2308 'terabytes': 1000 ** 4,
2309 'tebibytes': 1024 ** 4,
2310 'PiB': 1024 ** 5,
2311 'PB': 1000 ** 5,
2312 'pB': 1024 ** 5,
2313 'Pb': 1000 ** 5,
2314 'pb': 1000 ** 5,
2315 'petabytes': 1000 ** 5,
2316 'pebibytes': 1024 ** 5,
2317 'EiB': 1024 ** 6,
2318 'EB': 1000 ** 6,
2319 'eB': 1024 ** 6,
2320 'Eb': 1000 ** 6,
2321 'eb': 1000 ** 6,
2322 'exabytes': 1000 ** 6,
2323 'exbibytes': 1024 ** 6,
2324 'ZiB': 1024 ** 7,
2325 'ZB': 1000 ** 7,
2326 'zB': 1024 ** 7,
2327 'Zb': 1000 ** 7,
2328 'zb': 1000 ** 7,
2329 'zettabytes': 1000 ** 7,
2330 'zebibytes': 1024 ** 7,
2331 'YiB': 1024 ** 8,
2332 'YB': 1000 ** 8,
2333 'yB': 1024 ** 8,
2334 'Yb': 1000 ** 8,
2335 'yb': 1000 ** 8,
2336 'yottabytes': 1000 ** 8,
2337 'yobibytes': 1024 ** 8,
2338 }
2339
2340 return lookup_unit_table(_UNIT_TABLE, s)
2341
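# Illustrative examples (decimal vs binary units, per the table above):
#   parse_filesize('2 MiB')    # -> 2097152
#   parse_filesize('5 GB')     # -> 5000000000
#   parse_filesize('1,24 KB')  # -> 1240 (comma as decimal separator)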
2342
2343 def parse_count(s):
2344 if s is None:
2345 return None
2346
2347 s = re.sub(r'^[^\d]+\s', '', s).strip()
2348
2349 if re.match(r'^[\d,.]+$', s):
2350 return str_to_int(s)
2351
2352 _UNIT_TABLE = {
2353 'k': 1000,
2354 'K': 1000,
2355 'm': 1000 ** 2,
2356 'M': 1000 ** 2,
2357 'kk': 1000 ** 2,
2358 'KK': 1000 ** 2,
2359 'b': 1000 ** 3,
2360 'B': 1000 ** 3,
2361 }
2362
2363 ret = lookup_unit_table(_UNIT_TABLE, s)
2364 if ret is not None:
2365 return ret
2366
2367 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2368 if mobj:
2369 return str_to_int(mobj.group(1))
2370
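# Illustrative examples:
#   parse_count('1.1k views')  # -> 1100
#   parse_count('1,234')       # -> 1234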
2371
2372 def parse_resolution(s, *, lenient=False):
2373 if s is None:
2374 return {}
2375
2376 if lenient:
2377 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2378 else:
2379 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2380 if mobj:
2381 return {
2382 'width': int(mobj.group('w')),
2383 'height': int(mobj.group('h')),
2384 }
2385
2386 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2387 if mobj:
2388 return {'height': int(mobj.group(1))}
2389
2390 mobj = re.search(r'\b([48])[kK]\b', s)
2391 if mobj:
2392 return {'height': int(mobj.group(1)) * 540}
2393
2394 return {}
2395
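# Illustrative examples:
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # -> {'height': 720}
#   parse_resolution('4k')         # -> {'height': 2160}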
2396
2397 def parse_bitrate(s):
2398 if not isinstance(s, str):
2399 return
2400 mobj = re.search(r'\b(\d+)\s*kbps', s)
2401 if mobj:
2402 return int(mobj.group(1))
2403
2404
2405 def month_by_name(name, lang='en'):
2406 """ Return the number of a month by its locale-independent English name """
2407
2408 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2409
2410 try:
2411 return month_names.index(name) + 1
2412 except ValueError:
2413 return None
2414
2415
2416 def month_by_abbreviation(abbrev):
2417 """ Return the number of a month by its locale-independent English
2418 abbreviation """
2419
2420 try:
2421 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2422 except ValueError:
2423 return None
2424
2425
2426 def fix_xml_ampersands(xml_str):
2427 """Replace all unescaped '&' with '&amp;' in XML"""
2428 return re.sub(
2429 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2430 '&amp;',
2431 xml_str)
2432
2433
2434 def setproctitle(title):
2435 assert isinstance(title, str)
2436
2437 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2438 try:
2439 import ctypes
2440 except ImportError:
2441 return
2442
2443 try:
2444 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2445 except OSError:
2446 return
2447 except TypeError:
2448 # LoadLibrary in Windows Python 2.7.13 only expects
2449 # a bytestring, but since unicode_literals turns
2450 # every string into a unicode string, it fails.
2451 return
2452 title_bytes = title.encode()
2453 buf = ctypes.create_string_buffer(len(title_bytes))
2454 buf.value = title_bytes
2455 try:
2456 libc.prctl(15, buf, 0, 0, 0)
2457 except AttributeError:
2458 return # Strange libc, just skip this
2459
2460
2461 def remove_start(s, start):
2462 return s[len(start):] if s is not None and s.startswith(start) else s
2463
2464
2465 def remove_end(s, end):
2466 return s[:-len(end)] if s is not None and s.endswith(end) else s
2467
2468
2469 def remove_quotes(s):
2470 if s is None or len(s) < 2:
2471 return s
2472 for quote in ('"', "'", ):
2473 if s[0] == quote and s[-1] == quote:
2474 return s[1:-1]
2475 return s
2476
2477
2478 def get_domain(url):
2479 """
2480 This implementation is inconsistent, but is kept for compatibility.
2481 Use this only for "webpage_url_domain"
2482 """
2483 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2484
2485
2486 def url_basename(url):
2487 path = urllib.parse.urlparse(url).path
2488 return path.strip('/').split('/')[-1]
2489
2490
2491 def base_url(url):
2492 return re.match(r'https?://[^?#]+/', url).group()
2493
2494
2495 def urljoin(base, path):
2496 if isinstance(path, bytes):
2497 path = path.decode()
2498 if not isinstance(path, str) or not path:
2499 return None
2500 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2501 return path
2502 if isinstance(base, bytes):
2503 base = base.decode()
2504 if not isinstance(base, str) or not re.match(
2505 r'^(?:https?:)?//', base):
2506 return None
2507 return urllib.parse.urljoin(base, path)
2508
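# Illustrative examples for the URL helpers above:
#   url_basename('http://media.example.com/2010/trailer.mp4')  # -> 'trailer.mp4'
#   base_url('http://example.com/a/b/c?x=1')      # -> 'http://example.com/a/b/'
#   urljoin('http://example.com/a/', '/b/c.txt')  # -> 'http://example.com/b/c.txt'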
2509
2510 class HEADRequest(urllib.request.Request):
2511 def get_method(self):
2512 return 'HEAD'
2513
2514
2515 class PUTRequest(urllib.request.Request):
2516 def get_method(self):
2517 return 'PUT'
2518
2519
2520 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2521 if get_attr and v is not None:
2522 v = getattr(v, get_attr, None)
2523 try:
2524 return int(v) * invscale // scale
2525 except (ValueError, TypeError, OverflowError):
2526 return default
2527
2528
2529 def str_or_none(v, default=None):
2530 return default if v is None else str(v)
2531
2532
2533 def str_to_int(int_str):
2534 """ A more relaxed version of int_or_none """
2535 if isinstance(int_str, int):
2536 return int_str
2537 elif isinstance(int_str, str):
2538 int_str = re.sub(r'[,\.\+]', '', int_str)
2539 return int_or_none(int_str)
2540
2541
2542 def float_or_none(v, scale=1, invscale=1, default=None):
2543 if v is None:
2544 return default
2545 try:
2546 return float(v) * invscale / scale
2547 except (ValueError, TypeError):
2548 return default
2549
2550
2551 def bool_or_none(v, default=None):
2552 return v if isinstance(v, bool) else default
2553
2554
2555 def strip_or_none(v, default=None):
2556 return v.strip() if isinstance(v, str) else default
2557
2558
2559 def url_or_none(url):
2560 if not url or not isinstance(url, str):
2561 return None
2562 url = url.strip()
2563 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2564
2565
2566 def request_to_url(req):
2567 if isinstance(req, urllib.request.Request):
2568 return req.get_full_url()
2569 else:
2570 return req
2571
2572
2573 def strftime_or_none(timestamp, date_format, default=None):
2574 datetime_object = None
2575 try:
2576 if isinstance(timestamp, (int, float)): # unix timestamp
2577 # Using naive datetime here can break timestamp() in Windows
2578 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2579 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
2580 elif isinstance(timestamp, str): # assume YYYYMMDD
2581 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2582 date_format = re.sub( # Support %s on windows
2583 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
2584 return datetime_object.strftime(date_format)
2585 except (ValueError, TypeError, AttributeError):
2586 return default
2587
2588
2589 def parse_duration(s):
2590 if not isinstance(s, str):
2591 return None
2592 s = s.strip()
2593 if not s:
2594 return None
2595
2596 days, hours, mins, secs, ms = [None] * 5
2597 m = re.match(r'''(?x)
2598 (?P<before_secs>
2599 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2600 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2601 (?P<ms>[.:][0-9]+)?Z?$
2602 ''', s)
2603 if m:
2604 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2605 else:
2606 m = re.match(
2607 r'''(?ix)(?:P?
2608 (?:
2609 [0-9]+\s*y(?:ears?)?,?\s*
2610 )?
2611 (?:
2612 [0-9]+\s*m(?:onths?)?,?\s*
2613 )?
2614 (?:
2615 [0-9]+\s*w(?:eeks?)?,?\s*
2616 )?
2617 (?:
2618 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2619 )?
2620 T)?
2621 (?:
2622 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2623 )?
2624 (?:
2625 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2626 )?
2627 (?:
2628 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2629 )?Z?$''', s)
2630 if m:
2631 days, hours, mins, secs, ms = m.groups()
2632 else:
2633 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2634 if m:
2635 hours, mins = m.groups()
2636 else:
2637 return None
2638
2639 if ms:
2640 ms = ms.replace(':', '.')
2641 return sum(float(part or 0) * mult for part, mult in (
2642 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2643
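# Expected results in seconds (illustrative):
#   parse_duration('9:12:43')    # -> 33163.0
#   parse_duration('PT1H30M')    # -> 5400.0
#   parse_duration('2.5 hours')  # -> 9000.0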
2644
2645 def prepend_extension(filename, ext, expected_real_ext=None):
2646 name, real_ext = os.path.splitext(filename)
2647 return (
2648 f'{name}.{ext}{real_ext}'
2649 if not expected_real_ext or real_ext[1:] == expected_real_ext
2650 else f'{filename}.{ext}')
2651
2652
2653 def replace_extension(filename, ext, expected_real_ext=None):
2654 name, real_ext = os.path.splitext(filename)
2655 return '{}.{}'.format(
2656 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2657 ext)
2658
2659
2660 def check_executable(exe, args=[]):
2661 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2662 args can be a list of arguments for a short output (like -version) """
2663 try:
2664 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2665 except OSError:
2666 return False
2667 return exe
2668
2669
2670 def _get_exe_version_output(exe, args, *, to_screen=None):
2671 if to_screen:
2672 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2673 try:
2674 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2675 # SIGTTOU if yt-dlp is run in the background.
2676 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2677 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2678 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2679 except OSError:
2680 return False
2681 return stdout
2682
2683
2684 def detect_exe_version(output, version_re=None, unrecognized='present'):
2685 assert isinstance(output, str)
2686 if version_re is None:
2687 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2688 m = re.search(version_re, output)
2689 if m:
2690 return m.group(1)
2691 else:
2692 return unrecognized
2693
2694
2695 def get_exe_version(exe, args=['--version'],
2696 version_re=None, unrecognized='present'):
2697 """ Returns the version of the specified executable,
2698 or False if the executable is not present """
2699 out = _get_exe_version_output(exe, args)
2700 return detect_exe_version(out, version_re, unrecognized) if out else False
2701
2702
2703 def frange(start=0, stop=None, step=1):
2704 """Float range"""
2705 if stop is None:
2706 start, stop = 0, start
2707 sign = [-1, 1][step > 0] if step else 0
2708 while sign * start < sign * stop:
2709 yield start
2710 start += step
2711
2712
2713 class LazyList(collections.abc.Sequence):
2714 """Lazy immutable list from an iterable
2715 Note that slices of a LazyList are plain lists, not LazyList
2716
2717 class IndexError(IndexError):
2718 pass
2719
2720 def __init__(self, iterable, *, reverse=False, _cache=None):
2721 self._iterable = iter(iterable)
2722 self._cache = [] if _cache is None else _cache
2723 self._reversed = reverse
2724
2725 def __iter__(self):
2726 if self._reversed:
2727 # We need to consume the entire iterable to iterate in reverse
2728 yield from self.exhaust()
2729 return
2730 yield from self._cache
2731 for item in self._iterable:
2732 self._cache.append(item)
2733 yield item
2734
2735 def _exhaust(self):
2736 self._cache.extend(self._iterable)
2737 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2738 return self._cache
2739
2740 def exhaust(self):
2741 """Evaluate the entire iterable"""
2742 return self._exhaust()[::-1 if self._reversed else 1]
2743
2744 @staticmethod
2745 def _reverse_index(x):
2746 return None if x is None else ~x
2747
2748 def __getitem__(self, idx):
2749 if isinstance(idx, slice):
2750 if self._reversed:
2751 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2752 start, stop, step = idx.start, idx.stop, idx.step or 1
2753 elif isinstance(idx, int):
2754 if self._reversed:
2755 idx = self._reverse_index(idx)
2756 start, stop, step = idx, idx, 0
2757 else:
2758 raise TypeError('indices must be integers or slices')
2759 if ((start or 0) < 0 or (stop or 0) < 0
2760 or (start is None and step < 0)
2761 or (stop is None and step > 0)):
2762 # We need to consume the entire iterable to be able to slice from the end
2763 # Obviously, never use this with infinite iterables
2764 self._exhaust()
2765 try:
2766 return self._cache[idx]
2767 except IndexError as e:
2768 raise self.IndexError(e) from e
2769 n = max(start or 0, stop or 0) - len(self._cache) + 1
2770 if n > 0:
2771 self._cache.extend(itertools.islice(self._iterable, n))
2772 try:
2773 return self._cache[idx]
2774 except IndexError as e:
2775 raise self.IndexError(e) from e
2776
2777 def __bool__(self):
2778 try:
2779 self[-1] if self._reversed else self[0]
2780 except self.IndexError:
2781 return False
2782 return True
2783
2784 def __len__(self):
2785 self._exhaust()
2786 return len(self._cache)
2787
2788 def __reversed__(self):
2789 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2790
2791 def __copy__(self):
2792 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2793
2794 def __repr__(self):
2795 # repr and str should mimic a list. So we exhaust the iterable
2796 return repr(self.exhaust())
2797
2798 def __str__(self):
2799 return repr(self.exhaust())
2800
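# Illustrative usage (items are pulled from the iterable only on demand):
#   lazy = LazyList(itertools.count())
#   lazy[3]   # -> 3 (consumes items 0..3 into the cache)
#   lazy[:2]  # -> [0, 1] (served from the cache; a plain list)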
2801
2802 class PagedList:
2803
2804 class IndexError(IndexError):
2805 pass
2806
2807 def __len__(self):
2808 # This is only useful for tests
2809 return len(self.getslice())
2810
2811 def __init__(self, pagefunc, pagesize, use_cache=True):
2812 self._pagefunc = pagefunc
2813 self._pagesize = pagesize
2814 self._pagecount = float('inf')
2815 self._use_cache = use_cache
2816 self._cache = {}
2817
2818 def getpage(self, pagenum):
2819 page_results = self._cache.get(pagenum)
2820 if page_results is None:
2821 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2822 if self._use_cache:
2823 self._cache[pagenum] = page_results
2824 return page_results
2825
2826 def getslice(self, start=0, end=None):
2827 return list(self._getslice(start, end))
2828
2829 def _getslice(self, start, end):
2830 raise NotImplementedError('This method must be implemented by subclasses')
2831
2832 def __getitem__(self, idx):
2833 assert self._use_cache, 'Indexing PagedList requires cache'
2834 if not isinstance(idx, int) or idx < 0:
2835 raise TypeError('indices must be non-negative integers')
2836 entries = self.getslice(idx, idx + 1)
2837 if not entries:
2838 raise self.IndexError()
2839 return entries[0]
2840
2841
2842 class OnDemandPagedList(PagedList):
2843 """Download pages until a page with fewer than the maximum number of results is found"""
2844
2845 def _getslice(self, start, end):
2846 for pagenum in itertools.count(start // self._pagesize):
2847 firstid = pagenum * self._pagesize
2848 nextfirstid = pagenum * self._pagesize + self._pagesize
2849 if start >= nextfirstid:
2850 continue
2851
2852 startv = (
2853 start % self._pagesize
2854 if firstid <= start < nextfirstid
2855 else 0)
2856 endv = (
2857 ((end - 1) % self._pagesize) + 1
2858 if (end is not None and firstid <= end <= nextfirstid)
2859 else None)
2860
2861 try:
2862 page_results = self.getpage(pagenum)
2863 except Exception:
2864 self._pagecount = pagenum - 1
2865 raise
2866 if startv != 0 or endv is not None:
2867 page_results = page_results[startv:endv]
2868 yield from page_results
2869
2870 # A little optimization: if the current page is not "full", i.e. does
2871 # not contain page_size videos, then we can assume that this page
2872 # is the last one - there are no more IDs on further pages -
2873 # so there is no need to query again.
2874 if len(page_results) + startv < self._pagesize:
2875 break
2876
2877 # If we got the whole page, but the next page is not interesting,
2878 # break out early as well
2879 if end == nextfirstid:
2880 break
2881
2882
2883 class InAdvancePagedList(PagedList):
2884 """PagedList with total number of pages known in advance"""
2885
2886 def __init__(self, pagefunc, pagecount, pagesize):
2887 PagedList.__init__(self, pagefunc, pagesize, True)
2888 self._pagecount = pagecount
2889
2890 def _getslice(self, start, end):
2891 start_page = start // self._pagesize
2892 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2893 skip_elems = start - start_page * self._pagesize
2894 only_more = None if end is None else end - start
2895 for pagenum in range(start_page, end_page):
2896 page_results = self.getpage(pagenum)
2897 if skip_elems:
2898 page_results = page_results[skip_elems:]
2899 skip_elems = None
2900 if only_more is not None:
2901 if len(page_results) < only_more:
2902 only_more -= len(page_results)
2903 else:
2904 yield from page_results[:only_more]
2905 break
2906 yield from page_results
2907
2908
2909 class PlaylistEntries:
2910 MissingEntry = object()
2911 is_exhausted = False
2912
2913 def __init__(self, ydl, info_dict):
2914 self.ydl = ydl
2915
2916 # _entries must be assigned now since info_dict can change during iteration
2917 entries = info_dict.get('entries')
2918 if entries is None:
2919 raise EntryNotInPlaylist('There are no entries')
2920 elif isinstance(entries, list):
2921 self.is_exhausted = True
2922
2923 requested_entries = info_dict.get('requested_entries')
2924 self.is_incomplete = bool(requested_entries)
2925 if self.is_incomplete:
2926 assert self.is_exhausted
2927 self._entries = [self.MissingEntry] * max(requested_entries)
2928 for i, entry in zip(requested_entries, entries):
2929 self._entries[i - 1] = entry
2930 elif isinstance(entries, (list, PagedList, LazyList)):
2931 self._entries = entries
2932 else:
2933 self._entries = LazyList(entries)
2934
2935 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2936 (?P<start>[+-]?\d+)?
2937 (?P<range>[:-]
2938 (?P<end>[+-]?\d+|inf(?:inite)?)?
2939 (?::(?P<step>[+-]?\d+))?
2940 )?''')
2941
2942 @classmethod
2943 def parse_playlist_items(cls, string):
2944 for segment in string.split(','):
2945 if not segment:
2946 raise ValueError('There are two or more consecutive commas')
2947 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2948 if not mobj:
2949 raise ValueError(f'{segment!r} is not a valid specification')
2950 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2951 if int_or_none(step) == 0:
2952 raise ValueError(f'Step in {segment!r} cannot be zero')
2953 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2954
2955 def get_requested_items(self):
2956 playlist_items = self.ydl.params.get('playlist_items')
2957 playlist_start = self.ydl.params.get('playliststart', 1)
2958 playlist_end = self.ydl.params.get('playlistend')
2959 # For backwards compatibility, interpret -1 as whole list
2960 if playlist_end in (-1, None):
2961 playlist_end = ''
2962 if not playlist_items:
2963 playlist_items = f'{playlist_start}:{playlist_end}'
2964 elif playlist_start != 1 or playlist_end:
2965 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2966
2967 for index in self.parse_playlist_items(playlist_items):
2968 for i, entry in self[index]:
2969 yield i, entry
2970 if not entry:
2971 continue
2972 try:
2973 # TODO: Add auto-generated fields
2974 self.ydl._match_entry(entry, incomplete=True, silent=True)
2975 except (ExistingVideoReached, RejectedVideoReached):
2976 return
2977
2978 def get_full_count(self):
2979 if self.is_exhausted and not self.is_incomplete:
2980 return len(self)
2981 elif isinstance(self._entries, InAdvancePagedList):
2982 if self._entries._pagesize == 1:
2983 return self._entries._pagecount
2984
2985 @functools.cached_property
2986 def _getter(self):
2987 if isinstance(self._entries, list):
2988 def get_entry(i):
2989 try:
2990 entry = self._entries[i]
2991 except IndexError:
2992 entry = self.MissingEntry
2993 if not self.is_incomplete:
2994 raise self.IndexError()
2995 if entry is self.MissingEntry:
2996 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2997 return entry
2998 else:
2999 def get_entry(i):
3000 try:
3001 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3002 except (LazyList.IndexError, PagedList.IndexError):
3003 raise self.IndexError()
3004 return get_entry
3005
3006 def __getitem__(self, idx):
3007 if isinstance(idx, int):
3008 idx = slice(idx, idx)
3009
3010 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3011 step = 1 if idx.step is None else idx.step
3012 if idx.start is None:
3013 start = 0 if step > 0 else len(self) - 1
3014 else:
3015 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3016
3017 # NB: Do not call len(self) when idx == [:]
3018 if idx.stop is None:
3019 stop = 0 if step < 0 else float('inf')
3020 else:
3021 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3022 stop += [-1, 1][step > 0]
3023
3024 for i in frange(start, stop, step):
3025 if i < 0:
3026 continue
3027 try:
3028 entry = self._getter(i)
3029 except self.IndexError:
3030 self.is_exhausted = True
3031 if step > 0:
3032 break
3033 continue
3034 yield i + 1, entry
3035
3036 def __len__(self):
3037 return len(tuple(self[:]))
3038
3039 class IndexError(IndexError):
3040 pass
3041
3042
3043 def uppercase_escape(s):
3044 unicode_escape = codecs.getdecoder('unicode_escape')
3045 return re.sub(
3046 r'\\U[0-9a-fA-F]{8}',
3047 lambda m: unicode_escape(m.group(0))[0],
3048 s)
3049
3050
3051 def lowercase_escape(s):
3052 unicode_escape = codecs.getdecoder('unicode_escape')
3053 return re.sub(
3054 r'\\u[0-9a-fA-F]{4}',
3055 lambda m: unicode_escape(m.group(0))[0],
3056 s)
3057
3058
3059 def escape_rfc3986(s):
3060 """Escape non-ASCII characters as suggested by RFC 3986"""
3061 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3062
3063
3064 def escape_url(url):
3065 """Escape URL as suggested by RFC 3986"""
3066 url_parsed = urllib.parse.urlparse(url)
3067 return url_parsed._replace(
3068 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3069 path=escape_rfc3986(url_parsed.path),
3070 params=escape_rfc3986(url_parsed.params),
3071 query=escape_rfc3986(url_parsed.query),
3072 fragment=escape_rfc3986(url_parsed.fragment)
3073 ).geturl()
3074
3075
3076 def parse_qs(url):
3077 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
3078
3079
3080 def read_batch_urls(batch_fd):
3081 def fixup(url):
3082 if not isinstance(url, str):
3083 url = url.decode('utf-8', 'replace')
3084 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3085 for bom in BOM_UTF8:
3086 if url.startswith(bom):
3087 url = url[len(bom):]
3088 url = url.lstrip()
3089 if not url or url.startswith(('#', ';', ']')):
3090 return False
3091 # "#" cannot be stripped out since it is part of the URI
3092 # However, it can be safely stripped out if it follows a whitespace
3093 return re.split(r'\s#', url, 1)[0].rstrip()
3094
3095 with contextlib.closing(batch_fd) as fd:
3096 return [url for url in map(fixup, fd) if url]
3097
3098
3099 def urlencode_postdata(*args, **kargs):
3100 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3101
3102
3103 def update_url_query(url, query):
3104 if not query:
3105 return url
3106 parsed_url = urllib.parse.urlparse(url)
3107 qs = urllib.parse.parse_qs(parsed_url.query)
3108 qs.update(query)
3109 return urllib.parse.urlunparse(parsed_url._replace(
3110 query=urllib.parse.urlencode(qs, True)))
3111
3112
3113 def update_Request(req, url=None, data=None, headers=None, query=None):
3114 req_headers = req.headers.copy()
3115 req_headers.update(headers or {})
3116 req_data = data or req.data
3117 req_url = update_url_query(url or req.get_full_url(), query)
3118 req_get_method = req.get_method()
3119 if req_get_method == 'HEAD':
3120 req_type = HEADRequest
3121 elif req_get_method == 'PUT':
3122 req_type = PUTRequest
3123 else:
3124 req_type = urllib.request.Request
3125 new_req = req_type(
3126 req_url, data=req_data, headers=req_headers,
3127 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3128 if hasattr(req, 'timeout'):
3129 new_req.timeout = req.timeout
3130 return new_req
3131
3132
3133 def _multipart_encode_impl(data, boundary):
3134 content_type = 'multipart/form-data; boundary=%s' % boundary
3135
3136 out = b''
3137 for k, v in data.items():
3138 out += b'--' + boundary.encode('ascii') + b'\r\n'
3139 if isinstance(k, str):
3140 k = k.encode()
3141 if isinstance(v, str):
3142 v = v.encode()
3143 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3144 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3145 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3146 if boundary.encode('ascii') in content:
3147 raise ValueError('Boundary overlaps with data')
3148 out += content
3149
3150 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3151
3152 return out, content_type
3153
3154
3155 def multipart_encode(data, boundary=None):
3156 '''
3157 Encode a dict to RFC 7578-compliant form-data
3158
3159 data:
3160 A dict where keys and values can be either Unicode or bytes-like
3161 objects.
3162 boundary:
3163 If specified, the Unicode object is used as the boundary. Otherwise,
3164 a random boundary is generated.
3165
3166 Reference: https://tools.ietf.org/html/rfc7578
3167 '''
3168 has_specified_boundary = boundary is not None
3169
3170 while True:
3171 if boundary is None:
3172 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3173
3174 try:
3175 out, content_type = _multipart_encode_impl(data, boundary)
3176 break
3177 except ValueError:
3178 if has_specified_boundary:
3179 raise
3180 boundary = None
3181
3182 return out, content_type
3183
3184
3185 def variadic(x, allowed_types=(str, bytes, dict)):
3186 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
3187
3188
3189 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3190 for val in map(d.get, variadic(key_or_keys)):
3191 if val is not None and (val or not skip_false_values):
3192 return val
3193 return default
3194
3195
3196 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3197 for f in funcs:
3198 try:
3199 val = f(*args, **kwargs)
3200 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
3201 pass
3202 else:
3203 if expected_type is None or isinstance(val, expected_type):
3204 return val
3205
3206
3207 def try_get(src, getter, expected_type=None):
3208 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3209
3210
3211 def filter_dict(dct, cndn=lambda _, v: v is not None):
3212 return {k: v for k, v in dct.items() if cndn(k, v)}
3213
3214
3215 def merge_dicts(*dicts):
3216 merged = {}
3217 for a_dict in dicts:
3218 for k, v in a_dict.items():
3219 if (v is not None and k not in merged
3220 or isinstance(v, str) and merged[k] == ''):
3221 merged[k] = v
3222 return merged
3223
3224
3225 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3226 return string if isinstance(string, str) else str(string, encoding, errors)
3227
3228
3229 US_RATINGS = {
3230 'G': 0,
3231 'PG': 10,
3232 'PG-13': 13,
3233 'R': 16,
3234 'NC': 18,
3235 }
3236
3237
3238 TV_PARENTAL_GUIDELINES = {
3239 'TV-Y': 0,
3240 'TV-Y7': 7,
3241 'TV-G': 0,
3242 'TV-PG': 0,
3243 'TV-14': 14,
3244 'TV-MA': 17,
3245 }
3246
3247
3248 def parse_age_limit(s):
3249 # isinstance(False, int) is True. So type() must be used instead
3250 if type(s) is int: # noqa: E721
3251 return s if 0 <= s <= 21 else None
3252 elif not isinstance(s, str):
3253 return None
3254 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3255 if m:
3256 return int(m.group('age'))
3257 s = s.upper()
3258 if s in US_RATINGS:
3259 return US_RATINGS[s]
3260 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3261 if m:
3262 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3263 return None
3264
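# Illustrative examples:
#   parse_age_limit('PG-13')  # -> 13
#   parse_age_limit('TV-MA')  # -> 17
#   parse_age_limit(18)       # -> 18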
3265
3266 def strip_jsonp(code):
3267 return re.sub(
3268 r'''(?sx)^
3269 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3270 (?:\s*&&\s*(?P=func_name))?
3271 \s*\(\s*(?P<callback_data>.*)\);?
3272 \s*?(?://[^\n]*)*$''',
3273 r'\g<callback_data>', code)
3274
3275
3276 def js_to_json(code, vars={}, *, strict=False):
3277 # vars is a dict of var, val pairs to substitute
3278 STRING_QUOTES = '\'"'
3279 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
3280 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3281 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3282 INTEGER_TABLE = (
3283 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3284 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3285 )
3286
3287 def process_escape(match):
3288 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3289 escape = match.group(1) or match.group(2)
3290
3291 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3292 else R'\u00' if escape == 'x'
3293 else '' if escape == '\n'
3294 else escape)
3295
3296 def fix_kv(m):
3297 v = m.group(0)
3298 if v in ('true', 'false', 'null'):
3299 return v
3300 elif v in ('undefined', 'void 0'):
3301 return 'null'
3302 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3303 return ''
3304
3305 if v[0] in STRING_QUOTES:
3306 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
3307 return f'"{escaped}"'
3308
3309 for regex, base in INTEGER_TABLE:
3310 im = re.match(regex, v)
3311 if im:
3312 i = int(im.group(1), base)
3313 return f'"{i}":' if v.endswith(':') else str(i)
3314
3315 if v in vars:
3316 return json.dumps(vars[v])
3317
3318 if not strict:
3319 return f'"{v}"'
3320
3321 raise ValueError(f'Unknown value: {v}')
3322
3323 def create_map(mobj):
3324 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3325
3326 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3327 if not strict:
3328 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3329 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
3330
3331 return re.sub(rf'''(?sx)
3332 {STRING_RE}|
3333 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
3334 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3335 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3336 [0-9]+(?={SKIP_RE}:)|
3337 !+
3338 ''', fix_kv, code)
3339
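# Illustrative examples (JS object literal to valid JSON):
#   js_to_json("{abc: 1, 'def': 'x'}")  # -> '{"abc": 1, "def": "x"}'
#   js_to_json('{"k": undefined}')      # -> '{"k": null}'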
3340
3341 def qualities(quality_ids):
3342 """ Get a numeric quality value out of a list of possible values """
3343 def q(qid):
3344 try:
3345 return quality_ids.index(qid)
3346 except ValueError:
3347 return -1
3348 return q
3349
3350
3351 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3352
3353
3354 DEFAULT_OUTTMPL = {
3355 'default': '%(title)s [%(id)s].%(ext)s',
3356 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3357 }
3358 OUTTMPL_TYPES = {
3359 'chapter': None,
3360 'subtitle': None,
3361 'thumbnail': None,
3362 'description': 'description',
3363 'annotation': 'annotations.xml',
3364 'infojson': 'info.json',
3365 'link': None,
3366 'pl_video': None,
3367 'pl_thumbnail': None,
3368 'pl_description': 'description',
3369 'pl_infojson': 'info.json',
3370 }
3371
3372 # As of [1], the format syntax is:
3373 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3374 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3375 STR_FORMAT_RE_TMPL = r'''(?x)
3376 (?<!%)(?P<prefix>(?:%%)*)
3377 %
3378 (?P<has_key>\((?P<key>{0})\))?
3379 (?P<format>
3380 (?P<conversion>[#0\-+ ]+)?
3381 (?P<min_width>\d+)?
3382 (?P<precision>\.\d+)?
3383 (?P<len_mod>[hlL])? # unused in python
3384 {1} # conversion type
3385 )
3386 '''
3387
3388
3389 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3390
3391
3392 def limit_length(s, length):
3393 """ Add ellipses to overly long strings """
3394 if s is None:
3395 return None
3396 ELLIPSES = '...'
3397 if len(s) > length:
3398 return s[:length - len(ELLIPSES)] + ELLIPSES
3399 return s
3400
3401
3402 def version_tuple(v):
3403 return tuple(int(e) for e in re.split(r'[-.]', v))
3404
3405
3406 def is_outdated_version(version, limit, assume_new=True):
3407 if not version:
3408 return not assume_new
3409 try:
3410 return version_tuple(version) < version_tuple(limit)
3411 except ValueError:
3412 return not assume_new
3413
3414
3415 def ytdl_is_updateable():
3416 """ Returns whether yt-dlp can be updated with -U """
3417
3418 from .update import is_non_updateable
3419
3420 return not is_non_updateable()
3421
3422
3423 def args_to_str(args):
3424 # Get a short string representation for a subprocess command
3425 return ' '.join(compat_shlex_quote(a) for a in args)
3426
3427
3428 def error_to_compat_str(err):
3429 return str(err)
3430
3431
3432 def error_to_str(err):
3433 return f'{type(err).__name__}: {err}'
3434
3435
3436 def mimetype2ext(mt):
3437 if mt is None:
3438 return None
3439
3440 mt, _, params = mt.partition(';')
3441 mt = mt.strip()
3442
3443 FULL_MAP = {
3444 'audio/mp4': 'm4a',
3445 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
3446 # since it is the most popular one
3447 'audio/mpeg': 'mp3',
3448 'audio/x-wav': 'wav',
3449 'audio/wav': 'wav',
3450 'audio/wave': 'wav',
3451 }
3452
3453 ext = FULL_MAP.get(mt)
3454 if ext is not None:
3455 return ext
3456
3457 SUBTYPE_MAP = {
3458 '3gpp': '3gp',
3459 'smptett+xml': 'tt',
3460 'ttaf+xml': 'dfxp',
3461 'ttml+xml': 'ttml',
3462 'x-flv': 'flv',
3463 'x-mp4-fragmented': 'mp4',
3464 'x-ms-sami': 'sami',
3465 'x-ms-wmv': 'wmv',
3466 'mpegurl': 'm3u8',
3467 'x-mpegurl': 'm3u8',
3468 'vnd.apple.mpegurl': 'm3u8',
3469 'dash+xml': 'mpd',
3470 'f4m+xml': 'f4m',
3471 'hds+xml': 'f4m',
3472 'vnd.ms-sstr+xml': 'ism',
3473 'quicktime': 'mov',
3474 'mp2t': 'ts',
3475 'x-wav': 'wav',
3476 'filmstrip+json': 'fs',
3477 'svg+xml': 'svg',
3478 }
3479
3480 _, _, subtype = mt.rpartition('/')
3481 ext = SUBTYPE_MAP.get(subtype.lower())
3482 if ext is not None:
3483 return ext
3484
3485 SUFFIX_MAP = {
3486 'json': 'json',
3487 'xml': 'xml',
3488 'zip': 'zip',
3489 'gzip': 'gz',
3490 }
3491
3492 _, _, suffix = subtype.partition('+')
3493 ext = SUFFIX_MAP.get(suffix)
3494 if ext is not None:
3495 return ext
3496
3497 return subtype.replace('+', '.')
3498
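# Illustrative examples:
#   mimetype2ext('audio/x-wav')                          # -> 'wav'
#   mimetype2ext('application/x-mpegurl')                # -> 'm3u8'
#   mimetype2ext('application/dash+xml; charset=UTF-8')  # -> 'mpd'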
3499
3500 def ext2mimetype(ext_or_url):
3501 if not ext_or_url:
3502 return None
3503 if '.' not in ext_or_url:
3504 ext_or_url = f'file.{ext_or_url}'
3505 return mimetypes.guess_type(ext_or_url)[0]
3506
3507
3508 def parse_codecs(codecs_str):
3509 # http://tools.ietf.org/html/rfc6381
3510 if not codecs_str:
3511 return {}
3512 split_codecs = list(filter(None, map(
3513 str.strip, codecs_str.strip().strip(',').split(','))))
3514 vcodec, acodec, scodec, hdr = None, None, None, None
3515 for full_codec in split_codecs:
3516 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3517 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3518 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3519 if vcodec:
3520 continue
3521 vcodec = full_codec
3522 if parts[0] in ('dvh1', 'dvhe'):
3523 hdr = 'DV'
3524 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3525 hdr = 'HDR10'
3526 elif parts[:2] == ['vp9', '2']:
3527 hdr = 'HDR10'
3528 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3529 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3530 acodec = acodec or full_codec
3531 elif parts[0] in ('stpp', 'wvtt'):
3532 scodec = scodec or full_codec
3533 else:
3534 write_string(f'WARNING: Unknown codec {full_codec}\n')
3535 if vcodec or acodec or scodec:
3536 return {
3537 'vcodec': vcodec or 'none',
3538 'acodec': acodec or 'none',
3539 'dynamic_range': hdr,
3540 **({'scodec': scodec} if scodec is not None else {}),
3541 }
3542 elif len(split_codecs) == 2:
3543 return {
3544 'vcodec': split_codecs[0],
3545 'acodec': split_codecs[1],
3546 }
3547 return {}
3548
3549
3550 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3551 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3552
3553 allow_mkv = not preferences or 'mkv' in preferences
3554
3555 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3556 return 'mkv' # TODO: any other format allows this?
3557
3558 # TODO: Not all codecs supported by parse_codecs are handled here
3559 COMPATIBLE_CODECS = {
3560 'mp4': {
3561 'av1', 'hevc', 'avc1', 'mp4a', # fourcc (m3u8, mpd)
3562 'h264', 'aacl', 'ec-3', # Set in ISM
3563 },
3564 'webm': {
3565 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3566 'vp9x', 'vp8x', # in the webm spec
3567 },
3568 }
3569
3570 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3571 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3572
3573 for ext in preferences or COMPATIBLE_CODECS.keys():
3574 codec_set = COMPATIBLE_CODECS.get(ext, set())
3575 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3576 return ext
3577
3578 COMPATIBLE_EXTS = (
3579 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3580 {'webm'},
3581 )
3582 for ext in preferences or vexts:
3583 current_exts = {ext, *vexts, *aexts}
3584 if ext == 'mkv' or current_exts == {ext} or any(
3585 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3586 return ext
3587 return 'mkv' if allow_mkv else preferences[-1]
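# For instance, a single avc1 video stream plus a single mp4a audio stream
# can be merged into mp4:
# >>> get_compatible_ext(vcodecs=['avc1.64001f'], acodecs=['mp4a.40.2'], vexts=['mp4'], aexts=['m4a'])
# 'mp4'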
3588
3589
3590 def urlhandle_detect_ext(url_handle):
3591 getheader = url_handle.headers.get
3592
3593 cd = getheader('Content-Disposition')
3594 if cd:
3595 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3596 if m:
3597 e = determine_ext(m.group('filename'), default_ext=None)
3598 if e:
3599 return e
3600
3601 return mimetype2ext(getheader('Content-Type'))
3602
3603
3604 def encode_data_uri(data, mime_type):
3605 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
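# Example:
# >>> encode_data_uri(b'hi', 'text/plain')
# 'data:text/plain;base64,aGk='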
3606
3607
3608 def age_restricted(content_limit, age_limit):
3609 """ Returns True iff the content should be blocked """
3610
3611 if age_limit is None: # No limit set
3612 return False
3613 if content_limit is None:
3614 return False # Content available for everyone
3615 return age_limit < content_limit
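# e.g. a 16-year-old limit set by the user blocks 18+ content:
# >>> age_restricted(content_limit=18, age_limit=16)
# True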
3616
3617
3618 # List of known byte-order-marks (BOM)
3619 BOMS = [
3620 (b'\xef\xbb\xbf', 'utf-8'),
3621 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3622 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3623 (b'\xff\xfe', 'utf-16-le'),
3624 (b'\xfe\xff', 'utf-16-be'),
3625 ]
3626
3627
3628 def is_html(first_bytes):
3629 """ Detect whether a file contains HTML by examining its first bytes. """
3630
3631 encoding = 'utf-8'
3632 for bom, enc in BOMS:
3633 while first_bytes.startswith(bom):
3634 encoding, first_bytes = enc, first_bytes[len(bom):]
3635
3636 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
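# A UTF-8 BOM followed by markup is recognized as HTML:
# >>> bool(is_html(b'\xef\xbb\xbf  <!DOCTYPE html>'))
# True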
3637
3638
3639 def determine_protocol(info_dict):
3640 protocol = info_dict.get('protocol')
3641 if protocol is not None:
3642 return protocol
3643
3644 url = sanitize_url(info_dict['url'])
3645 if url.startswith('rtmp'):
3646 return 'rtmp'
3647 elif url.startswith('mms'):
3648 return 'mms'
3649 elif url.startswith('rtsp'):
3650 return 'rtsp'
3651
3652 ext = determine_ext(url)
3653 if ext == 'm3u8':
3654 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3655 elif ext == 'f4m':
3656 return 'f4m'
3657
3658 return urllib.parse.urlparse(url).scheme
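# e.g. an m3u8 URL without an explicit protocol resolves to the native HLS downloader:
# >>> determine_protocol({'url': 'https://example.com/media.m3u8'})
# 'm3u8_native'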
3659
3660
3661 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3662 """ Render a list of rows, each as a list of values.
3663 Text after a \t will be right aligned """
3664 def width(string):
3665 return len(remove_terminal_sequences(string).replace('\t', ''))
3666
3667 def get_max_lens(table):
3668 return [max(width(str(v)) for v in col) for col in zip(*table)]
3669
3670 def filter_using_list(row, filterArray):
3671 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3672
3673 max_lens = get_max_lens(data) if hide_empty else []
3674 header_row = filter_using_list(header_row, max_lens)
3675 data = [filter_using_list(row, max_lens) for row in data]
3676
3677 table = [header_row] + data
3678 max_lens = get_max_lens(table)
3679 extra_gap += 1
3680 if delim:
3681 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3682 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3683 for row in table:
3684 for pos, text in enumerate(map(str, row)):
3685 if '\t' in text:
3686 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3687 else:
3688 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3689 ret = '\n'.join(''.join(row).rstrip() for row in table)
3690 return ret
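# A minimal example of the column sizing above (one space of default gap):
# >>> print(render_table(['a', 'b'], [['1', '2'], ['333', '4']]))
# a   b
# 1   2
# 333 4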
3691
3692
3693 def _match_one(filter_part, dct, incomplete):
3694 # TODO: Generalize code with YoutubeDL._build_format_filter
3695 STRING_OPERATORS = {
3696 '*=': operator.contains,
3697 '^=': lambda attr, value: attr.startswith(value),
3698 '$=': lambda attr, value: attr.endswith(value),
3699 '~=': lambda attr, value: re.search(value, attr),
3700 }
3701 COMPARISON_OPERATORS = {
3702 **STRING_OPERATORS,
3703 '<=': operator.le, # "<=" must be defined above "<"
3704 '<': operator.lt,
3705 '>=': operator.ge,
3706 '>': operator.gt,
3707 '=': operator.eq,
3708 }
3709
3710 if isinstance(incomplete, bool):
3711 is_incomplete = lambda _: incomplete
3712 else:
3713 is_incomplete = lambda k: k in incomplete
3714
3715 operator_rex = re.compile(r'''(?x)
3716 (?P<key>[a-z_]+)
3717 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3718 (?:
3719 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3720 (?P<strval>.+?)
3721 )
3722 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3723 m = operator_rex.fullmatch(filter_part.strip())
3724 if m:
3725 m = m.groupdict()
3726 unnegated_op = COMPARISON_OPERATORS[m['op']]
3727 if m['negation']:
3728 op = lambda attr, value: not unnegated_op(attr, value)
3729 else:
3730 op = unnegated_op
3731 comparison_value = m['quotedstrval'] or m['strval']  # the regex guarantees one of these two groups matched
3732 if m['quote']:
3733 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3734 actual_value = dct.get(m['key'])
3735 numeric_comparison = None
3736 if isinstance(actual_value, (int, float)):
3737 # If the original field is a string and the matching comparison value is
3738 # a number, we should respect the origin of the original field
3739 # and process comparison value as a string (see
3740 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3741 try:
3742 numeric_comparison = int(comparison_value)
3743 except ValueError:
3744 numeric_comparison = parse_filesize(comparison_value)
3745 if numeric_comparison is None:
3746 numeric_comparison = parse_filesize(f'{comparison_value}B')
3747 if numeric_comparison is None:
3748 numeric_comparison = parse_duration(comparison_value)
3749 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3750 raise ValueError('Operator %s only supports string values!' % m['op'])
3751 if actual_value is None:
3752 return is_incomplete(m['key']) or m['none_inclusive']
3753 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3754
3755 UNARY_OPERATORS = {
3756 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3757 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3758 }
3759 operator_rex = re.compile(r'''(?x)
3760 (?P<op>%s)\s*(?P<key>[a-z_]+)
3761 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3762 m = operator_rex.fullmatch(filter_part.strip())
3763 if m:
3764 op = UNARY_OPERATORS[m.group('op')]
3765 actual_value = dct.get(m.group('key'))
3766 if is_incomplete(m.group('key')) and actual_value is None:
3767 return True
3768 return op(actual_value)
3769
3770 raise ValueError('Invalid filter part %r' % filter_part)
3771
3772
3773 def match_str(filter_str, dct, incomplete=False):
3774 """ Filter a dictionary with a simple string syntax.
3775 @returns Whether the filter passes
3776 @param incomplete Set of keys that are expected to be missing from dct.
3777 Can be True/False to indicate all/none of the keys may be missing.
3778 All conditions on incomplete keys pass if the key is missing
3779 """
3780 return all(
3781 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3782 for filter_part in re.split(r'(?<!\\)&', filter_str))
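# e.g. combining a numeric comparison with a none-inclusive ('?') one:
# >>> match_str('like_count > 100 & dislike_count <? 50', {'like_count': 190, 'dislike_count': 10})
# True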
3783
3784
3785 def match_filter_func(filters):
3786 if not filters:
3787 return None
3788 filters = set(variadic(filters))
3789
3790 interactive = '-' in filters
3791 if interactive:
3792 filters.remove('-')
3793
3794 def _match_func(info_dict, incomplete=False):
3795 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3796 return NO_DEFAULT if interactive and not incomplete else None
3797 else:
3798 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3799 filter_str = ') | ('.join(map(str.strip, filters))
3800 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3801 return _match_func
3802
3803
3804 class download_range_func:
3805 def __init__(self, chapters, ranges):
3806 self.chapters, self.ranges = chapters, ranges
3807
3808 def __call__(self, info_dict, ydl):
3809 if not self.ranges and not self.chapters:
3810 yield {}
3811
3812 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3813 else 'Cannot match chapters since chapter information is unavailable')
3814 for regex in self.chapters or []:
3815 for i, chapter in enumerate(info_dict.get('chapters') or []):
3816 if re.search(regex, chapter['title']):
3817 warning = None
3818 yield {**chapter, 'index': i}
3819 if self.chapters and warning:
3820 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3821
3822 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3823
3824 def __eq__(self, other):
3825 return (isinstance(other, download_range_func)
3826 and self.chapters == other.chapters and self.ranges == other.ranges)
3827
3828
3829 def parse_dfxp_time_expr(time_expr):
3830 if not time_expr:
3831 return
3832
3833 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3834 if mobj:
3835 return float(mobj.group('time_offset'))
3836
3837 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3838 if mobj:
3839 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
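# Both supported notations yield seconds as a float:
# >>> parse_dfxp_time_expr('1.5s')
# 1.5
# >>> parse_dfxp_time_expr('00:00:01.500')
# 1.5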
3840
3841
3842 def srt_subtitles_timecode(seconds):
3843 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3844
3845
3846 def ass_subtitles_timecode(seconds):
3847 time = timetuple_from_msec(seconds * 1000)
3848 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
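# The two timecode formats differ in separators and precision:
# >>> srt_subtitles_timecode(3661.5)
# '01:01:01,500'
# >>> ass_subtitles_timecode(3661.5)
# '1:01:01.50'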
3849
3850
3851 def dfxp2srt(dfxp_data):
3852 '''
3853 @param dfxp_data A bytes-like object containing DFXP data
3854 @returns A unicode object containing converted SRT data
3855 '''
3856 LEGACY_NAMESPACES = (
3857 (b'http://www.w3.org/ns/ttml', [
3858 b'http://www.w3.org/2004/11/ttaf1',
3859 b'http://www.w3.org/2006/04/ttaf1',
3860 b'http://www.w3.org/2006/10/ttaf1',
3861 ]),
3862 (b'http://www.w3.org/ns/ttml#styling', [
3863 b'http://www.w3.org/ns/ttml#style',
3864 ]),
3865 )
3866
3867 SUPPORTED_STYLING = [
3868 'color',
3869 'fontFamily',
3870 'fontSize',
3871 'fontStyle',
3872 'fontWeight',
3873 'textDecoration'
3874 ]
3875
3876 _x = functools.partial(xpath_with_ns, ns_map={
3877 'xml': 'http://www.w3.org/XML/1998/namespace',
3878 'ttml': 'http://www.w3.org/ns/ttml',
3879 'tts': 'http://www.w3.org/ns/ttml#styling',
3880 })
3881
3882 styles = {}
3883 default_style = {}
3884
3885 class TTMLPElementParser:
3886 _out = ''
3887 _unclosed_elements = []
3888 _applied_styles = []
3889
3890 def start(self, tag, attrib):
3891 if tag in (_x('ttml:br'), 'br'):
3892 self._out += '\n'
3893 else:
3894 unclosed_elements = []
3895 style = {}
3896 element_style_id = attrib.get('style')
3897 if default_style:
3898 style.update(default_style)
3899 if element_style_id:
3900 style.update(styles.get(element_style_id, {}))
3901 for prop in SUPPORTED_STYLING:
3902 prop_val = attrib.get(_x('tts:' + prop))
3903 if prop_val:
3904 style[prop] = prop_val
3905 if style:
3906 font = ''
3907 for k, v in sorted(style.items()):
3908 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3909 continue
3910 if k == 'color':
3911 font += ' color="%s"' % v
3912 elif k == 'fontSize':
3913 font += ' size="%s"' % v
3914 elif k == 'fontFamily':
3915 font += ' face="%s"' % v
3916 elif k == 'fontWeight' and v == 'bold':
3917 self._out += '<b>'
3918 unclosed_elements.append('b')
3919 elif k == 'fontStyle' and v == 'italic':
3920 self._out += '<i>'
3921 unclosed_elements.append('i')
3922 elif k == 'textDecoration' and v == 'underline':
3923 self._out += '<u>'
3924 unclosed_elements.append('u')
3925 if font:
3926 self._out += '<font' + font + '>'
3927 unclosed_elements.append('font')
3928 applied_style = {}
3929 if self._applied_styles:
3930 applied_style.update(self._applied_styles[-1])
3931 applied_style.update(style)
3932 self._applied_styles.append(applied_style)
3933 self._unclosed_elements.append(unclosed_elements)
3934
3935 def end(self, tag):
3936 if tag not in (_x('ttml:br'), 'br'):
3937 unclosed_elements = self._unclosed_elements.pop()
3938 for element in reversed(unclosed_elements):
3939 self._out += '</%s>' % element
3940 if unclosed_elements and self._applied_styles:
3941 self._applied_styles.pop()
3942
3943 def data(self, data):
3944 self._out += data
3945
3946 def close(self):
3947 return self._out.strip()
3948
3949 def parse_node(node):
3950 target = TTMLPElementParser()
3951 parser = xml.etree.ElementTree.XMLParser(target=target)
3952 parser.feed(xml.etree.ElementTree.tostring(node))
3953 return parser.close()
3954
3955 for k, v in LEGACY_NAMESPACES:
3956 for ns in v:
3957 dfxp_data = dfxp_data.replace(ns, k)
3958
3959 dfxp = compat_etree_fromstring(dfxp_data)
3960 out = []
3961 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3962
3963 if not paras:
3964 raise ValueError('Invalid dfxp/TTML subtitle')
3965
3966 repeat = False
3967 while True:
3968 for style in dfxp.findall(_x('.//ttml:style')):
3969 style_id = style.get('id') or style.get(_x('xml:id'))
3970 if not style_id:
3971 continue
3972 parent_style_id = style.get('style')
3973 if parent_style_id:
3974 if parent_style_id not in styles:
3975 repeat = True
3976 continue
3977 styles[style_id] = styles[parent_style_id].copy()
3978 for prop in SUPPORTED_STYLING:
3979 prop_val = style.get(_x('tts:' + prop))
3980 if prop_val:
3981 styles.setdefault(style_id, {})[prop] = prop_val
3982 if repeat:
3983 repeat = False
3984 else:
3985 break
3986
3987 for p in ('body', 'div'):
3988 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3989 if ele is None:
3990 continue
3991 style = styles.get(ele.get('style'))
3992 if not style:
3993 continue
3994 default_style.update(style)
3995
3996 for para, index in zip(paras, itertools.count(1)):
3997 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3998 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3999 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4000 if begin_time is None:
4001 continue
4002 if not end_time:
4003 if not dur:
4004 continue
4005 end_time = begin_time + dur
4006 out.append('%d\n%s --> %s\n%s\n\n' % (
4007 index,
4008 srt_subtitles_timecode(begin_time),
4009 srt_subtitles_timecode(end_time),
4010 parse_node(para)))
4011
4012 return ''.join(out)
4013
4014
4015 def cli_option(params, command_option, param, separator=None):
4016 param = params.get(param)
4017 return ([] if param is None
4018 else [command_option, str(param)] if separator is None
4019 else [f'{command_option}{separator}{param}'])
4020
4021
4022 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4023 param = params.get(param)
4024 assert param in (True, False, None)
4025 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
4026
4027
4028 def cli_valueless_option(params, command_option, param, expected_value=True):
4029 return [command_option] if params.get(param) == expected_value else []
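# How the three cli_* helpers translate option dicts into argv fragments:
# >>> cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy')
# ['--proxy', '127.0.0.1:3128']
# >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
# ['--no-check-certificate', 'true']
# >>> cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
# ['--quiet']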
4030
4031
4032 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4033 if isinstance(argdict, (list, tuple)): # for backward compatibility
4034 if use_compat:
4035 return argdict
4036 else:
4037 argdict = None
4038 if argdict is None:
4039 return default
4040 assert isinstance(argdict, dict)
4041
4042 assert isinstance(keys, (list, tuple))
4043 for key_list in keys:
4044 arg_list = list(filter(
4045 lambda x: x is not None,
4046 [argdict.get(key.lower()) for key in variadic(key_list)]))
4047 if arg_list:
4048 return [arg for args in arg_list for arg in args]
4049 return default
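# The first key (or key tuple) with any configured args wins, e.g.:
# >>> cli_configuration_args({'ffmpeg': ['-v', 'quiet']}, ['ffmpeg', 'default'])
# ['-v', 'quiet']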
4050
4051
4052 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4053 main_key, exe = main_key.lower(), exe.lower()
4054 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4055 keys = [f'{root_key}{k}' for k in (keys or [''])]
4056 if root_key in keys:
4057 if main_key != exe:
4058 keys.append((main_key, exe))
4059 keys.append('default')
4060 else:
4061 use_compat = False
4062 return cli_configuration_args(argdict, keys, default, use_compat)
4063
4064
4065 class ISO639Utils:
4066 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4067 _lang_map = {
4068 'aa': 'aar',
4069 'ab': 'abk',
4070 'ae': 'ave',
4071 'af': 'afr',
4072 'ak': 'aka',
4073 'am': 'amh',
4074 'an': 'arg',
4075 'ar': 'ara',
4076 'as': 'asm',
4077 'av': 'ava',
4078 'ay': 'aym',
4079 'az': 'aze',
4080 'ba': 'bak',
4081 'be': 'bel',
4082 'bg': 'bul',
4083 'bh': 'bih',
4084 'bi': 'bis',
4085 'bm': 'bam',
4086 'bn': 'ben',
4087 'bo': 'bod',
4088 'br': 'bre',
4089 'bs': 'bos',
4090 'ca': 'cat',
4091 'ce': 'che',
4092 'ch': 'cha',
4093 'co': 'cos',
4094 'cr': 'cre',
4095 'cs': 'ces',
4096 'cu': 'chu',
4097 'cv': 'chv',
4098 'cy': 'cym',
4099 'da': 'dan',
4100 'de': 'deu',
4101 'dv': 'div',
4102 'dz': 'dzo',
4103 'ee': 'ewe',
4104 'el': 'ell',
4105 'en': 'eng',
4106 'eo': 'epo',
4107 'es': 'spa',
4108 'et': 'est',
4109 'eu': 'eus',
4110 'fa': 'fas',
4111 'ff': 'ful',
4112 'fi': 'fin',
4113 'fj': 'fij',
4114 'fo': 'fao',
4115 'fr': 'fra',
4116 'fy': 'fry',
4117 'ga': 'gle',
4118 'gd': 'gla',
4119 'gl': 'glg',
4120 'gn': 'grn',
4121 'gu': 'guj',
4122 'gv': 'glv',
4123 'ha': 'hau',
4124 'he': 'heb',
4125 'iw': 'heb', # Replaced by he in 1989 revision
4126 'hi': 'hin',
4127 'ho': 'hmo',
4128 'hr': 'hrv',
4129 'ht': 'hat',
4130 'hu': 'hun',
4131 'hy': 'hye',
4132 'hz': 'her',
4133 'ia': 'ina',
4134 'id': 'ind',
4135 'in': 'ind', # Replaced by id in 1989 revision
4136 'ie': 'ile',
4137 'ig': 'ibo',
4138 'ii': 'iii',
4139 'ik': 'ipk',
4140 'io': 'ido',
4141 'is': 'isl',
4142 'it': 'ita',
4143 'iu': 'iku',
4144 'ja': 'jpn',
4145 'jv': 'jav',
4146 'ka': 'kat',
4147 'kg': 'kon',
4148 'ki': 'kik',
4149 'kj': 'kua',
4150 'kk': 'kaz',
4151 'kl': 'kal',
4152 'km': 'khm',
4153 'kn': 'kan',
4154 'ko': 'kor',
4155 'kr': 'kau',
4156 'ks': 'kas',
4157 'ku': 'kur',
4158 'kv': 'kom',
4159 'kw': 'cor',
4160 'ky': 'kir',
4161 'la': 'lat',
4162 'lb': 'ltz',
4163 'lg': 'lug',
4164 'li': 'lim',
4165 'ln': 'lin',
4166 'lo': 'lao',
4167 'lt': 'lit',
4168 'lu': 'lub',
4169 'lv': 'lav',
4170 'mg': 'mlg',
4171 'mh': 'mah',
4172 'mi': 'mri',
4173 'mk': 'mkd',
4174 'ml': 'mal',
4175 'mn': 'mon',
4176 'mr': 'mar',
4177 'ms': 'msa',
4178 'mt': 'mlt',
4179 'my': 'mya',
4180 'na': 'nau',
4181 'nb': 'nob',
4182 'nd': 'nde',
4183 'ne': 'nep',
4184 'ng': 'ndo',
4185 'nl': 'nld',
4186 'nn': 'nno',
4187 'no': 'nor',
4188 'nr': 'nbl',
4189 'nv': 'nav',
4190 'ny': 'nya',
4191 'oc': 'oci',
4192 'oj': 'oji',
4193 'om': 'orm',
4194 'or': 'ori',
4195 'os': 'oss',
4196 'pa': 'pan',
4197 'pi': 'pli',
4198 'pl': 'pol',
4199 'ps': 'pus',
4200 'pt': 'por',
4201 'qu': 'que',
4202 'rm': 'roh',
4203 'rn': 'run',
4204 'ro': 'ron',
4205 'ru': 'rus',
4206 'rw': 'kin',
4207 'sa': 'san',
4208 'sc': 'srd',
4209 'sd': 'snd',
4210 'se': 'sme',
4211 'sg': 'sag',
4212 'si': 'sin',
4213 'sk': 'slk',
4214 'sl': 'slv',
4215 'sm': 'smo',
4216 'sn': 'sna',
4217 'so': 'som',
4218 'sq': 'sqi',
4219 'sr': 'srp',
4220 'ss': 'ssw',
4221 'st': 'sot',
4222 'su': 'sun',
4223 'sv': 'swe',
4224 'sw': 'swa',
4225 'ta': 'tam',
4226 'te': 'tel',
4227 'tg': 'tgk',
4228 'th': 'tha',
4229 'ti': 'tir',
4230 'tk': 'tuk',
4231 'tl': 'tgl',
4232 'tn': 'tsn',
4233 'to': 'ton',
4234 'tr': 'tur',
4235 'ts': 'tso',
4236 'tt': 'tat',
4237 'tw': 'twi',
4238 'ty': 'tah',
4239 'ug': 'uig',
4240 'uk': 'ukr',
4241 'ur': 'urd',
4242 'uz': 'uzb',
4243 've': 'ven',
4244 'vi': 'vie',
4245 'vo': 'vol',
4246 'wa': 'wln',
4247 'wo': 'wol',
4248 'xh': 'xho',
4249 'yi': 'yid',
4250 'ji': 'yid', # Replaced by yi in 1989 revision
4251 'yo': 'yor',
4252 'za': 'zha',
4253 'zh': 'zho',
4254 'zu': 'zul',
4255 }
4256
4257 @classmethod
4258 def short2long(cls, code):
4259 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4260 return cls._lang_map.get(code[:2])
4261
4262 @classmethod
4263 def long2short(cls, code):
4264 """Convert language code from ISO 639-2/T to ISO 639-1"""
4265 for short_name, long_name in cls._lang_map.items():
4266 if long_name == code:
4267 return short_name
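# >>> ISO639Utils.short2long('en')
# 'eng'
# >>> ISO639Utils.long2short('deu')
# 'de'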
4268
4269
4270 class ISO3166Utils:
4271 # From http://data.okfn.org/data/core/country-list
4272 _country_map = {
4273 'AF': 'Afghanistan',
4274 'AX': 'Åland Islands',
4275 'AL': 'Albania',
4276 'DZ': 'Algeria',
4277 'AS': 'American Samoa',
4278 'AD': 'Andorra',
4279 'AO': 'Angola',
4280 'AI': 'Anguilla',
4281 'AQ': 'Antarctica',
4282 'AG': 'Antigua and Barbuda',
4283 'AR': 'Argentina',
4284 'AM': 'Armenia',
4285 'AW': 'Aruba',
4286 'AU': 'Australia',
4287 'AT': 'Austria',
4288 'AZ': 'Azerbaijan',
4289 'BS': 'Bahamas',
4290 'BH': 'Bahrain',
4291 'BD': 'Bangladesh',
4292 'BB': 'Barbados',
4293 'BY': 'Belarus',
4294 'BE': 'Belgium',
4295 'BZ': 'Belize',
4296 'BJ': 'Benin',
4297 'BM': 'Bermuda',
4298 'BT': 'Bhutan',
4299 'BO': 'Bolivia, Plurinational State of',
4300 'BQ': 'Bonaire, Sint Eustatius and Saba',
4301 'BA': 'Bosnia and Herzegovina',
4302 'BW': 'Botswana',
4303 'BV': 'Bouvet Island',
4304 'BR': 'Brazil',
4305 'IO': 'British Indian Ocean Territory',
4306 'BN': 'Brunei Darussalam',
4307 'BG': 'Bulgaria',
4308 'BF': 'Burkina Faso',
4309 'BI': 'Burundi',
4310 'KH': 'Cambodia',
4311 'CM': 'Cameroon',
4312 'CA': 'Canada',
4313 'CV': 'Cape Verde',
4314 'KY': 'Cayman Islands',
4315 'CF': 'Central African Republic',
4316 'TD': 'Chad',
4317 'CL': 'Chile',
4318 'CN': 'China',
4319 'CX': 'Christmas Island',
4320 'CC': 'Cocos (Keeling) Islands',
4321 'CO': 'Colombia',
4322 'KM': 'Comoros',
4323 'CG': 'Congo',
4324 'CD': 'Congo, the Democratic Republic of the',
4325 'CK': 'Cook Islands',
4326 'CR': 'Costa Rica',
4327 'CI': 'Côte d\'Ivoire',
4328 'HR': 'Croatia',
4329 'CU': 'Cuba',
4330 'CW': 'Curaçao',
4331 'CY': 'Cyprus',
4332 'CZ': 'Czech Republic',
4333 'DK': 'Denmark',
4334 'DJ': 'Djibouti',
4335 'DM': 'Dominica',
4336 'DO': 'Dominican Republic',
4337 'EC': 'Ecuador',
4338 'EG': 'Egypt',
4339 'SV': 'El Salvador',
4340 'GQ': 'Equatorial Guinea',
4341 'ER': 'Eritrea',
4342 'EE': 'Estonia',
4343 'ET': 'Ethiopia',
4344 'FK': 'Falkland Islands (Malvinas)',
4345 'FO': 'Faroe Islands',
4346 'FJ': 'Fiji',
4347 'FI': 'Finland',
4348 'FR': 'France',
4349 'GF': 'French Guiana',
4350 'PF': 'French Polynesia',
4351 'TF': 'French Southern Territories',
4352 'GA': 'Gabon',
4353 'GM': 'Gambia',
4354 'GE': 'Georgia',
4355 'DE': 'Germany',
4356 'GH': 'Ghana',
4357 'GI': 'Gibraltar',
4358 'GR': 'Greece',
4359 'GL': 'Greenland',
4360 'GD': 'Grenada',
4361 'GP': 'Guadeloupe',
4362 'GU': 'Guam',
4363 'GT': 'Guatemala',
4364 'GG': 'Guernsey',
4365 'GN': 'Guinea',
4366 'GW': 'Guinea-Bissau',
4367 'GY': 'Guyana',
4368 'HT': 'Haiti',
4369 'HM': 'Heard Island and McDonald Islands',
4370 'VA': 'Holy See (Vatican City State)',
4371 'HN': 'Honduras',
4372 'HK': 'Hong Kong',
4373 'HU': 'Hungary',
4374 'IS': 'Iceland',
4375 'IN': 'India',
4376 'ID': 'Indonesia',
4377 'IR': 'Iran, Islamic Republic of',
4378 'IQ': 'Iraq',
4379 'IE': 'Ireland',
4380 'IM': 'Isle of Man',
4381 'IL': 'Israel',
4382 'IT': 'Italy',
4383 'JM': 'Jamaica',
4384 'JP': 'Japan',
4385 'JE': 'Jersey',
4386 'JO': 'Jordan',
4387 'KZ': 'Kazakhstan',
4388 'KE': 'Kenya',
4389 'KI': 'Kiribati',
4390 'KP': 'Korea, Democratic People\'s Republic of',
4391 'KR': 'Korea, Republic of',
4392 'KW': 'Kuwait',
4393 'KG': 'Kyrgyzstan',
4394 'LA': 'Lao People\'s Democratic Republic',
4395 'LV': 'Latvia',
4396 'LB': 'Lebanon',
4397 'LS': 'Lesotho',
4398 'LR': 'Liberia',
4399 'LY': 'Libya',
4400 'LI': 'Liechtenstein',
4401 'LT': 'Lithuania',
4402 'LU': 'Luxembourg',
4403 'MO': 'Macao',
4404 'MK': 'Macedonia, the Former Yugoslav Republic of',
4405 'MG': 'Madagascar',
4406 'MW': 'Malawi',
4407 'MY': 'Malaysia',
4408 'MV': 'Maldives',
4409 'ML': 'Mali',
4410 'MT': 'Malta',
4411 'MH': 'Marshall Islands',
4412 'MQ': 'Martinique',
4413 'MR': 'Mauritania',
4414 'MU': 'Mauritius',
4415 'YT': 'Mayotte',
4416 'MX': 'Mexico',
4417 'FM': 'Micronesia, Federated States of',
4418 'MD': 'Moldova, Republic of',
4419 'MC': 'Monaco',
4420 'MN': 'Mongolia',
4421 'ME': 'Montenegro',
4422 'MS': 'Montserrat',
4423 'MA': 'Morocco',
4424 'MZ': 'Mozambique',
4425 'MM': 'Myanmar',
4426 'NA': 'Namibia',
4427 'NR': 'Nauru',
4428 'NP': 'Nepal',
4429 'NL': 'Netherlands',
4430 'NC': 'New Caledonia',
4431 'NZ': 'New Zealand',
4432 'NI': 'Nicaragua',
4433 'NE': 'Niger',
4434 'NG': 'Nigeria',
4435 'NU': 'Niue',
4436 'NF': 'Norfolk Island',
4437 'MP': 'Northern Mariana Islands',
4438 'NO': 'Norway',
4439 'OM': 'Oman',
4440 'PK': 'Pakistan',
4441 'PW': 'Palau',
4442 'PS': 'Palestine, State of',
4443 'PA': 'Panama',
4444 'PG': 'Papua New Guinea',
4445 'PY': 'Paraguay',
4446 'PE': 'Peru',
4447 'PH': 'Philippines',
4448 'PN': 'Pitcairn',
4449 'PL': 'Poland',
4450 'PT': 'Portugal',
4451 'PR': 'Puerto Rico',
4452 'QA': 'Qatar',
4453 'RE': 'Réunion',
4454 'RO': 'Romania',
4455 'RU': 'Russian Federation',
4456 'RW': 'Rwanda',
4457 'BL': 'Saint Barthélemy',
4458 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4459 'KN': 'Saint Kitts and Nevis',
4460 'LC': 'Saint Lucia',
4461 'MF': 'Saint Martin (French part)',
4462 'PM': 'Saint Pierre and Miquelon',
4463 'VC': 'Saint Vincent and the Grenadines',
4464 'WS': 'Samoa',
4465 'SM': 'San Marino',
4466 'ST': 'Sao Tome and Principe',
4467 'SA': 'Saudi Arabia',
4468 'SN': 'Senegal',
4469 'RS': 'Serbia',
4470 'SC': 'Seychelles',
4471 'SL': 'Sierra Leone',
4472 'SG': 'Singapore',
4473 'SX': 'Sint Maarten (Dutch part)',
4474 'SK': 'Slovakia',
4475 'SI': 'Slovenia',
4476 'SB': 'Solomon Islands',
4477 'SO': 'Somalia',
4478 'ZA': 'South Africa',
4479 'GS': 'South Georgia and the South Sandwich Islands',
4480 'SS': 'South Sudan',
4481 'ES': 'Spain',
4482 'LK': 'Sri Lanka',
4483 'SD': 'Sudan',
4484 'SR': 'Suriname',
4485 'SJ': 'Svalbard and Jan Mayen',
4486 'SZ': 'Swaziland',
4487 'SE': 'Sweden',
4488 'CH': 'Switzerland',
4489 'SY': 'Syrian Arab Republic',
4490 'TW': 'Taiwan, Province of China',
4491 'TJ': 'Tajikistan',
4492 'TZ': 'Tanzania, United Republic of',
4493 'TH': 'Thailand',
4494 'TL': 'Timor-Leste',
4495 'TG': 'Togo',
4496 'TK': 'Tokelau',
4497 'TO': 'Tonga',
4498 'TT': 'Trinidad and Tobago',
4499 'TN': 'Tunisia',
4500 'TR': 'Turkey',
4501 'TM': 'Turkmenistan',
4502 'TC': 'Turks and Caicos Islands',
4503 'TV': 'Tuvalu',
4504 'UG': 'Uganda',
4505 'UA': 'Ukraine',
4506 'AE': 'United Arab Emirates',
4507 'GB': 'United Kingdom',
4508 'US': 'United States',
4509 'UM': 'United States Minor Outlying Islands',
4510 'UY': 'Uruguay',
4511 'UZ': 'Uzbekistan',
4512 'VU': 'Vanuatu',
4513 'VE': 'Venezuela, Bolivarian Republic of',
4514 'VN': 'Viet Nam',
4515 'VG': 'Virgin Islands, British',
4516 'VI': 'Virgin Islands, U.S.',
4517 'WF': 'Wallis and Futuna',
4518 'EH': 'Western Sahara',
4519 'YE': 'Yemen',
4520 'ZM': 'Zambia',
4521 'ZW': 'Zimbabwe',
4522 # Not ISO 3166 codes, but used for IP blocks
4523 'AP': 'Asia/Pacific Region',
4524 'EU': 'Europe',
4525 }
4526
4527 @classmethod
4528 def short2full(cls, code):
4529 """Convert an ISO 3166-2 country code to the corresponding full name"""
4530 return cls._country_map.get(code.upper())
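# >>> ISO3166Utils.short2full('DE')
# 'Germany'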
4531
4532
4533 class GeoUtils:
4534 # Major IPv4 address blocks per country
4535 _country_ip_map = {
4536 'AD': '46.172.224.0/19',
4537 'AE': '94.200.0.0/13',
4538 'AF': '149.54.0.0/17',
4539 'AG': '209.59.64.0/18',
4540 'AI': '204.14.248.0/21',
4541 'AL': '46.99.0.0/16',
4542 'AM': '46.70.0.0/15',
4543 'AO': '105.168.0.0/13',
4544 'AP': '182.50.184.0/21',
4545 'AQ': '23.154.160.0/24',
4546 'AR': '181.0.0.0/12',
4547 'AS': '202.70.112.0/20',
4548 'AT': '77.116.0.0/14',
4549 'AU': '1.128.0.0/11',
4550 'AW': '181.41.0.0/18',
4551 'AX': '185.217.4.0/22',
4552 'AZ': '5.197.0.0/16',
4553 'BA': '31.176.128.0/17',
4554 'BB': '65.48.128.0/17',
4555 'BD': '114.130.0.0/16',
4556 'BE': '57.0.0.0/8',
4557 'BF': '102.178.0.0/15',
4558 'BG': '95.42.0.0/15',
4559 'BH': '37.131.0.0/17',
4560 'BI': '154.117.192.0/18',
4561 'BJ': '137.255.0.0/16',
4562 'BL': '185.212.72.0/23',
4563 'BM': '196.12.64.0/18',
4564 'BN': '156.31.0.0/16',
4565 'BO': '161.56.0.0/16',
4566 'BQ': '161.0.80.0/20',
4567 'BR': '191.128.0.0/12',
4568 'BS': '24.51.64.0/18',
4569 'BT': '119.2.96.0/19',
4570 'BW': '168.167.0.0/16',
4571 'BY': '178.120.0.0/13',
4572 'BZ': '179.42.192.0/18',
4573 'CA': '99.224.0.0/11',
4574 'CD': '41.243.0.0/16',
4575 'CF': '197.242.176.0/21',
4576 'CG': '160.113.0.0/16',
4577 'CH': '85.0.0.0/13',
4578 'CI': '102.136.0.0/14',
4579 'CK': '202.65.32.0/19',
4580 'CL': '152.172.0.0/14',
4581 'CM': '102.244.0.0/14',
4582 'CN': '36.128.0.0/10',
4583 'CO': '181.240.0.0/12',
4584 'CR': '201.192.0.0/12',
4585 'CU': '152.206.0.0/15',
4586 'CV': '165.90.96.0/19',
4587 'CW': '190.88.128.0/17',
4588 'CY': '31.153.0.0/16',
4589 'CZ': '88.100.0.0/14',
4590 'DE': '53.0.0.0/8',
4591 'DJ': '197.241.0.0/17',
4592 'DK': '87.48.0.0/12',
4593 'DM': '192.243.48.0/20',
4594 'DO': '152.166.0.0/15',
4595 'DZ': '41.96.0.0/12',
4596 'EC': '186.68.0.0/15',
4597 'EE': '90.190.0.0/15',
4598 'EG': '156.160.0.0/11',
4599 'ER': '196.200.96.0/20',
4600 'ES': '88.0.0.0/11',
4601 'ET': '196.188.0.0/14',
4602 'EU': '2.16.0.0/13',
4603 'FI': '91.152.0.0/13',
4604 'FJ': '144.120.0.0/16',
4605 'FK': '80.73.208.0/21',
4606 'FM': '119.252.112.0/20',
4607 'FO': '88.85.32.0/19',
4608 'FR': '90.0.0.0/9',
4609 'GA': '41.158.0.0/15',
4610 'GB': '25.0.0.0/8',
4611 'GD': '74.122.88.0/21',
4612 'GE': '31.146.0.0/16',
4613 'GF': '161.22.64.0/18',
4614 'GG': '62.68.160.0/19',
4615 'GH': '154.160.0.0/12',
4616 'GI': '95.164.0.0/16',
4617 'GL': '88.83.0.0/19',
4618 'GM': '160.182.0.0/15',
4619 'GN': '197.149.192.0/18',
4620 'GP': '104.250.0.0/19',
4621 'GQ': '105.235.224.0/20',
4622 'GR': '94.64.0.0/13',
4623 'GT': '168.234.0.0/16',
4624 'GU': '168.123.0.0/16',
4625 'GW': '197.214.80.0/20',
4626 'GY': '181.41.64.0/18',
4627 'HK': '113.252.0.0/14',
4628 'HN': '181.210.0.0/16',
4629 'HR': '93.136.0.0/13',
4630 'HT': '148.102.128.0/17',
4631 'HU': '84.0.0.0/14',
4632 'ID': '39.192.0.0/10',
4633 'IE': '87.32.0.0/12',
4634 'IL': '79.176.0.0/13',
4635 'IM': '5.62.80.0/20',
4636 'IN': '117.192.0.0/10',
4637 'IO': '203.83.48.0/21',
4638 'IQ': '37.236.0.0/14',
4639 'IR': '2.176.0.0/12',
4640 'IS': '82.221.0.0/16',
4641 'IT': '79.0.0.0/10',
4642 'JE': '87.244.64.0/18',
4643 'JM': '72.27.0.0/17',
4644 'JO': '176.29.0.0/16',
4645 'JP': '133.0.0.0/8',
4646 'KE': '105.48.0.0/12',
4647 'KG': '158.181.128.0/17',
4648 'KH': '36.37.128.0/17',
4649 'KI': '103.25.140.0/22',
4650 'KM': '197.255.224.0/20',
4651 'KN': '198.167.192.0/19',
4652 'KP': '175.45.176.0/22',
4653 'KR': '175.192.0.0/10',
4654 'KW': '37.36.0.0/14',
4655 'KY': '64.96.0.0/15',
4656 'KZ': '2.72.0.0/13',
4657 'LA': '115.84.64.0/18',
4658 'LB': '178.135.0.0/16',
4659 'LC': '24.92.144.0/20',
4660 'LI': '82.117.0.0/19',
4661 'LK': '112.134.0.0/15',
4662 'LR': '102.183.0.0/16',
4663 'LS': '129.232.0.0/17',
4664 'LT': '78.56.0.0/13',
4665 'LU': '188.42.0.0/16',
4666 'LV': '46.109.0.0/16',
4667 'LY': '41.252.0.0/14',
4668 'MA': '105.128.0.0/11',
4669 'MC': '88.209.64.0/18',
4670 'MD': '37.246.0.0/16',
4671 'ME': '178.175.0.0/17',
4672 'MF': '74.112.232.0/21',
4673 'MG': '154.126.0.0/17',
4674 'MH': '117.103.88.0/21',
4675 'MK': '77.28.0.0/15',
4676 'ML': '154.118.128.0/18',
4677 'MM': '37.111.0.0/17',
4678 'MN': '49.0.128.0/17',
4679 'MO': '60.246.0.0/16',
4680 'MP': '202.88.64.0/20',
4681 'MQ': '109.203.224.0/19',
4682 'MR': '41.188.64.0/18',
4683 'MS': '208.90.112.0/22',
4684 'MT': '46.11.0.0/16',
4685 'MU': '105.16.0.0/12',
4686 'MV': '27.114.128.0/18',
4687 'MW': '102.70.0.0/15',
4688 'MX': '187.192.0.0/11',
4689 'MY': '175.136.0.0/13',
4690 'MZ': '197.218.0.0/15',
4691 'NA': '41.182.0.0/16',
4692 'NC': '101.101.0.0/18',
4693 'NE': '197.214.0.0/18',
4694 'NF': '203.17.240.0/22',
4695 'NG': '105.112.0.0/12',
4696 'NI': '186.76.0.0/15',
4697 'NL': '145.96.0.0/11',
4698 'NO': '84.208.0.0/13',
4699 'NP': '36.252.0.0/15',
4700 'NR': '203.98.224.0/19',
4701 'NU': '49.156.48.0/22',
4702 'NZ': '49.224.0.0/14',
4703 'OM': '5.36.0.0/15',
4704 'PA': '186.72.0.0/15',
4705 'PE': '186.160.0.0/14',
4706 'PF': '123.50.64.0/18',
4707 'PG': '124.240.192.0/19',
4708 'PH': '49.144.0.0/13',
4709 'PK': '39.32.0.0/11',
4710 'PL': '83.0.0.0/11',
4711 'PM': '70.36.0.0/20',
4712 'PR': '66.50.0.0/16',
4713 'PS': '188.161.0.0/16',
4714 'PT': '85.240.0.0/13',
4715 'PW': '202.124.224.0/20',
4716 'PY': '181.120.0.0/14',
4717 'QA': '37.210.0.0/15',
4718 'RE': '102.35.0.0/16',
4719 'RO': '79.112.0.0/13',
4720 'RS': '93.86.0.0/15',
4721 'RU': '5.136.0.0/13',
4722 'RW': '41.186.0.0/16',
4723 'SA': '188.48.0.0/13',
4724 'SB': '202.1.160.0/19',
4725 'SC': '154.192.0.0/11',
4726 'SD': '102.120.0.0/13',
4727 'SE': '78.64.0.0/12',
4728 'SG': '8.128.0.0/10',
4729 'SI': '188.196.0.0/14',
4730 'SK': '78.98.0.0/15',
4731 'SL': '102.143.0.0/17',
4732 'SM': '89.186.32.0/19',
4733 'SN': '41.82.0.0/15',
4734 'SO': '154.115.192.0/18',
4735 'SR': '186.179.128.0/17',
4736 'SS': '105.235.208.0/21',
4737 'ST': '197.159.160.0/19',
4738 'SV': '168.243.0.0/16',
4739 'SX': '190.102.0.0/20',
4740 'SY': '5.0.0.0/16',
4741 'SZ': '41.84.224.0/19',
4742 'TC': '65.255.48.0/20',
4743 'TD': '154.68.128.0/19',
4744 'TG': '196.168.0.0/14',
4745 'TH': '171.96.0.0/13',
4746 'TJ': '85.9.128.0/18',
4747 'TK': '27.96.24.0/21',
4748 'TL': '180.189.160.0/20',
4749 'TM': '95.85.96.0/19',
4750 'TN': '197.0.0.0/11',
4751 'TO': '175.176.144.0/21',
4752 'TR': '78.160.0.0/11',
4753 'TT': '186.44.0.0/15',
4754 'TV': '202.2.96.0/19',
4755 'TW': '120.96.0.0/11',
4756 'TZ': '156.156.0.0/14',
4757 'UA': '37.52.0.0/14',
4758 'UG': '102.80.0.0/13',
4759 'US': '6.0.0.0/8',
4760 'UY': '167.56.0.0/13',
4761 'UZ': '84.54.64.0/18',
4762 'VA': '212.77.0.0/19',
4763 'VC': '207.191.240.0/21',
4764 'VE': '186.88.0.0/13',
4765 'VG': '66.81.192.0/20',
4766 'VI': '146.226.0.0/16',
4767 'VN': '14.160.0.0/11',
4768 'VU': '202.80.32.0/20',
4769 'WF': '117.20.32.0/21',
4770 'WS': '202.4.32.0/19',
4771 'YE': '134.35.0.0/16',
4772 'YT': '41.242.116.0/22',
4773 'ZA': '41.0.0.0/11',
4774 'ZM': '102.144.0.0/13',
4775 'ZW': '102.177.192.0/18',
4776 }
4777
4778 @classmethod
4779 def random_ipv4(cls, code_or_block):
4780 if len(code_or_block) == 2:
4781 block = cls._country_ip_map.get(code_or_block.upper())
4782 if not block:
4783 return None
4784 else:
4785 block = code_or_block
4786 addr, preflen = block.split('/')
4787 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4788 addr_max = addr_min | (0xffffffff >> int(preflen))
4789 return str(socket.inet_ntoa(
4790 struct.pack('!L', random.randint(addr_min, addr_max))))
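# Passing a two-letter code picks a block from the table above, while a CIDR
# block is used directly; e.g. a /31 has only two possible results:
# >>> GeoUtils.random_ipv4('127.0.0.0/31') in ('127.0.0.0', '127.0.0.1')
# True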
4791
4792
4793 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4794 def __init__(self, proxies=None):
4795 # Set default handlers
4796 for type in ('http', 'https'):
4797 setattr(self, '%s_open' % type,
4798 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4799 meth(r, proxy, type))
4800 urllib.request.ProxyHandler.__init__(self, proxies)
4801
4802 def proxy_open(self, req, proxy, type):
4803 req_proxy = req.headers.get('Ytdl-request-proxy')
4804 if req_proxy is not None:
4805 proxy = req_proxy
4806 del req.headers['Ytdl-request-proxy']
4807
4808 if proxy == '__noproxy__':
4809 return None # No Proxy
4810 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4811 req.add_header('Ytdl-socks-proxy', proxy)
4812 # yt-dlp's http/https handlers handle wrapping the socket with SOCKS
4813 return None
4814 return urllib.request.ProxyHandler.proxy_open(
4815 self, req, proxy, type)
4816
4817
4818 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4819 # released into the public domain
4820 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4821
4822 def long_to_bytes(n, blocksize=0):
4823 """long_to_bytes(n:long, blocksize:int) : string
4824 Convert a long integer to a byte string.
4825
4826 If optional blocksize is given and greater than zero, pad the front of the
4827 byte string with binary zeros so that the length is a multiple of
4828 blocksize.
4829 """
4830 # after much testing, this algorithm was deemed to be the fastest
4831 s = b''
4832 n = int(n)
4833 while n > 0:
4834 s = struct.pack('>I', n & 0xffffffff) + s
4835 n = n >> 32
4836 # strip off leading zeros
4837 for i in range(len(s)):
4838 if s[i] != b'\000'[0]:
4839 break
4840 else:
4841 # only happens when n == 0
4842 s = b'\000'
4843 i = 0
4844 s = s[i:]
4845 # add back some pad bytes. this could be done more efficiently w.r.t. the
4846 # de-padding being done above, but sigh...
4847 if blocksize > 0 and len(s) % blocksize:
4848 s = (blocksize - len(s) % blocksize) * b'\000' + s
4849 return s
4850
4851
4852 def bytes_to_long(s):
4853 """bytes_to_long(string) : long
4854 Convert a byte string to a long integer.
4855
4856 This is (essentially) the inverse of long_to_bytes().
4857 """
4858 acc = 0
4859 length = len(s)
4860 if length % 4:
4861 extra = (4 - length % 4)
4862 s = b'\000' * extra + s
4863 length = length + extra
4864 for i in range(0, length, 4):
4865 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4866 return acc
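# The two helpers are inverses of each other (big-endian, unsigned):
# >>> bytes_to_long(b'\x01\x00')
# 256
# >>> long_to_bytes(256, blocksize=2)
# b'\x01\x00'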
4867
4868
4869 def ohdave_rsa_encrypt(data, exponent, modulus):
4870 '''
4871 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4872
4873 Input:
4874 data: data to encrypt, bytes-like object
4875 exponent, modulus: parameter e and N of RSA algorithm, both integer
4876 Output: hex string of encrypted data
4877
4878 Limitation: supports one block encryption only
4879 '''
4880
4881 payload = int(binascii.hexlify(data[::-1]), 16)
4882 encrypted = pow(payload, exponent, modulus)
4883 return '%x' % encrypted
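# A toy example with a deliberately tiny modulus (real keys are far larger):
# >>> ohdave_rsa_encrypt(b'\x02', 17, 3233)  # pow(2, 17, 3233) == 1752 == 0x6d8
# '6d8'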
4884
4885
4886 def pkcs1pad(data, length):
4887 """
4888 Padding input data with PKCS#1 scheme
4889
4890 @param {int[]} data input data
4891 @param {int} length target length
4892 @returns {int[]} padded data
4893 """
4894 if len(data) > length - 11:
4895 raise ValueError('Input data too long for PKCS#1 padding')
4896
4897 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 v1.5 requires nonzero padding bytes
4898 return [0, 2] + pseudo_random + [0] + data
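# The layout is [0, 2, <nonzero padding>, 0, <data>], e.g. for length 16:
# >>> padded = pkcs1pad([1, 2, 3], 16)
# >>> len(padded), padded[:2], padded[-4:]
# (16, [0, 2], [0, 1, 2, 3])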
4899
4900
4901 def _base_n_table(n, table):
4902 if not table and not n:
4903 raise ValueError('Either table or n must be specified')
4904 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4905
4906 if n and n != len(table):
4907 raise ValueError(f'base {n} exceeds table length {len(table)}')
4908 return table
4909
4910
4911 def encode_base_n(num, n=None, table=None):
4912 """Convert given int to a base-n string"""
4913 table = _base_n_table(n, table)
4914 if not num:
4915 return table[0]
4916
4917 result, base = '', len(table)
4918 while num:
4919 result = table[num % base] + result
4920 num = num // base
4921 return result
4922
4923
4924 def decode_base_n(string, n=None, table=None):
4925 """Convert given base-n string to int"""
4926 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4927 result, base = 0, len(table)
4928 for char in string:
4929 result = result * base + table[char]
4930 return result
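# Round trip with the default table (digits, then lower/upper-case letters):
# >>> encode_base_n(255, 16)
# 'ff'
# >>> decode_base_n('ff', 16)
# 255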
4931
4932
4933 def decode_base(value, digits):
4934 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
4935 f'in a future version. Use {__name__}.decode_base_n instead')
4936 return decode_base_n(value, table=digits)
4937
4938
4939 def decode_packed_codes(code):
4940 mobj = re.search(PACKED_CODES_RE, code)
4941 obfuscated_code, base, count, symbols = mobj.groups()
4942 base = int(base)
4943 count = int(count)
4944 symbols = symbols.split('|')
4945 symbol_table = {}
4946
4947 while count:
4948 count -= 1
4949 base_n_count = encode_base_n(count, base)
4950 symbol_table[base_n_count] = symbols[count] or base_n_count
4951
4952 return re.sub(
4953 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4954 obfuscated_code)
4955
4956
4957 def caesar(s, alphabet, shift):
4958 if shift == 0:
4959 return s
4960 l = len(alphabet)
4961 return ''.join(
4962 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4963 for c in s)
4964
4965
4966 def rot47(s):
4967 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
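# ROT47 shifts within the 94 printable ASCII characters and is an involution:
# >>> rot47('ABC')
# 'pqr'
# >>> rot47(rot47('ABC'))
# 'ABC'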
4968
4969
4970 def parse_m3u8_attributes(attrib):
4971 info = {}
4972 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4973 if val.startswith('"'):
4974 val = val[1:-1]
4975 info[key] = val
4976 return info
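# Quoted values may contain commas; the quotes themselves are stripped:
# >>> parse_m3u8_attributes('BANDWIDTH=800000,CODECS="avc1.4d401e,mp4a.40.2"')
# {'BANDWIDTH': '800000', 'CODECS': 'avc1.4d401e,mp4a.40.2'}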
4977
4978
4979 def urshift(val, n):
4980 return val >> n if val >= 0 else (val + 0x100000000) >> n
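# Emulates JavaScript's unsigned right shift (>>>) for 32-bit values:
# >>> urshift(-1, 28)
# 15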
4981
4982
4983 # Based on png2str() written by @gdkchan and improved by @yokrysty
4984 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4985 def decode_png(png_data):
4986 # Reference: https://www.w3.org/TR/PNG/
4987 header = png_data[8:]
4988
4989 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4990 raise OSError('Not a valid PNG file.')
4991
4992 int_map = {1: '>B', 2: '>H', 4: '>I'}
4993 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4994
4995 chunks = []
4996
4997 while header:
4998 length = unpack_integer(header[:4])
4999 header = header[4:]
5000
5001 chunk_type = header[:4]
5002 header = header[4:]
5003
5004 chunk_data = header[:length]
5005 header = header[length:]
5006
5007 header = header[4:] # Skip CRC
5008
5009 chunks.append({
5010 'type': chunk_type,
5011 'length': length,
5012 'data': chunk_data
5013 })
5014
5015 ihdr = chunks[0]['data']
5016
5017 width = unpack_integer(ihdr[:4])
5018 height = unpack_integer(ihdr[4:8])
5019
5020 idat = b''
5021
5022 for chunk in chunks:
5023 if chunk['type'] == b'IDAT':
5024 idat += chunk['data']
5025
5026 if not idat:
5027 raise OSError('Unable to read PNG data.')
5028
5029 decompressed_data = bytearray(zlib.decompress(idat))
5030
5031 stride = width * 3
5032 pixels = []
5033
5034 def _get_pixel(idx):
5035 x = idx % stride
5036 y = idx // stride
5037 return pixels[y][x]
5038
5039 for y in range(height):
5040 basePos = y * (1 + stride)
5041 filter_type = decompressed_data[basePos]
5042
5043 current_row = []
5044
5045 pixels.append(current_row)
5046
5047 for x in range(stride):
5048 color = decompressed_data[1 + basePos + x]
5049 basex = y * stride + x
5050 left = 0
5051 up = 0
5052
5053 if x > 2:
5054 left = _get_pixel(basex - 3)
5055 if y > 0:
5056 up = _get_pixel(basex - stride)
5057
5058 if filter_type == 1: # Sub
5059 color = (color + left) & 0xff
5060 elif filter_type == 2: # Up
5061 color = (color + up) & 0xff
5062 elif filter_type == 3: # Average
5063 color = (color + ((left + up) >> 1)) & 0xff
5064 elif filter_type == 4: # Paeth
5065 a = left
5066 b = up
5067 c = 0
5068
5069 if x > 2 and y > 0:
5070 c = _get_pixel(basex - stride - 3)
5071
5072 p = a + b - c
5073
5074 pa = abs(p - a)
5075 pb = abs(p - b)
5076 pc = abs(p - c)
5077
5078 if pa <= pb and pa <= pc:
5079 color = (color + a) & 0xff
5080 elif pb <= pc:
5081 color = (color + b) & 0xff
5082 else:
5083 color = (color + c) & 0xff
5084
5085 current_row.append(color)
5086
5087 return width, height, pixels
5088
5089
5090 def write_xattr(path, key, value):
5091 # Windows: Write xattrs to NTFS Alternate Data Streams:
5092 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5093 if compat_os_name == 'nt':
5094 assert ':' not in key
5095 assert os.path.exists(path)
5096
5097 try:
5098 with open(f'{path}:{key}', 'wb') as f:
5099 f.write(value)
5100 except OSError as e:
5101 raise XAttrMetadataError(e.errno, e.strerror)
5102 return
5103
5104 # UNIX Method 1. Use the xattr/pyxattr modules
5105
5106 setxattr = None
5107 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5108 # Unicode arguments are not supported in pyxattr until version 0.5.0
5109 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5110 if version_tuple(xattr.__version__) >= (0, 5, 0):
5111 setxattr = xattr.set
5112 elif xattr:
5113 setxattr = xattr.setxattr
5114
5115 if setxattr:
5116 try:
5117 setxattr(path, key, value)
5118 except OSError as e:
5119 raise XAttrMetadataError(e.errno, e.strerror)
5120 return
5121
5122 # UNIX Method 2. Use setfattr/xattr executables
5123 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5124 else 'xattr' if check_executable('xattr', ['-h']) else None)
5125 if not exe:
5126 raise XAttrUnavailableError(
5127 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5128 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5129
5130 value = value.decode()
5131 try:
5132 _, stderr, returncode = Popen.run(
5133 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5134 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5135 except OSError as e:
5136 raise XAttrMetadataError(e.errno, e.strerror)
5137 if returncode:
5138 raise XAttrMetadataError(returncode, stderr)
5139
5140
5141 def random_birthday(year_field, month_field, day_field):
5142 start_date = datetime.date(1950, 1, 1)
5143 end_date = datetime.date(1995, 12, 31)
5144 offset = random.randint(0, (end_date - start_date).days)
5145 random_date = start_date + datetime.timedelta(offset)
5146 return {
5147 year_field: str(random_date.year),
5148 month_field: str(random_date.month),
5149 day_field: str(random_date.day),
5150 }
5151
5152
5153 # Templates for internet shortcut files, which are plain text files.
5154 DOT_URL_LINK_TEMPLATE = '''\
5155 [InternetShortcut]
5156 URL=%(url)s
5157 '''
5158
5159 DOT_WEBLOC_LINK_TEMPLATE = '''\
5160 <?xml version="1.0" encoding="UTF-8"?>
5161 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5162 <plist version="1.0">
5163 <dict>
5164 \t<key>URL</key>
5165 \t<string>%(url)s</string>
5166 </dict>
5167 </plist>
5168 '''
5169
5170 DOT_DESKTOP_LINK_TEMPLATE = '''\
5171 [Desktop Entry]
5172 Encoding=UTF-8
5173 Name=%(filename)s
5174 Type=Link
5175 URL=%(url)s
5176 Icon=text-html
5177 '''
5178
5179 LINK_TEMPLATES = {
5180 'url': DOT_URL_LINK_TEMPLATE,
5181 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5182 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5183 }
5184
5185
5186 def iri_to_uri(iri):
5187 """
5188 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5189
5190 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes (using UTF-8) only those characters that are not already escaped, leaving existing escape sequences intact.
5191 """
5192
5193 iri_parts = urllib.parse.urlparse(iri)
5194
5195 if '[' in iri_parts.netloc:
5196 raise ValueError('IPv6 URIs are not yet supported.')
5197 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5198
5199 # The `safe` argument values used below contain the characters that should not be percent-encoded. Everything else except letters, digits and '_.-' will be percent-encoded using UTF-8. Anything already percent-encoded will be left as is.
5200
5201 net_location = ''
5202 if iri_parts.username:
5203 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5204 if iri_parts.password is not None:
5205 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5206 net_location += '@'
5207
5208 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5209 # The 'idna' encoding produces ASCII text.
5210 if iri_parts.port is not None and iri_parts.port != 80:
5211 net_location += ':' + str(iri_parts.port)
5212
5213 return urllib.parse.urlunparse(
5214 (iri_parts.scheme,
5215 net_location,
5216
5217 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5218
5219 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5220 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5221
5222 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5223 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5224
5225 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5226
5227 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
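# e.g. (the built-in 'idna' codec implements IDNA 2003):
# >>> iri_to_uri('https://müller.example/päth?q=wert')
# 'https://xn--mller-kva.example/p%C3%A4th?q=wert'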
5228
5229
5230 def to_high_limit_path(path):
5231 if sys.platform in ['win32', 'cygwin']:
5232 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5233 return '\\\\?\\' + os.path.abspath(path)
5234
5235 return path
5236
5237
5238 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5239 val = traverse_obj(obj, *variadic(field))
5240 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5241 return default
5242 return template % func(val)
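# Missing or empty fields fall back to `default`:
# >>> format_field({'width': 1920}, 'width', '%dpx')
# '1920px'
# >>> format_field({}, 'width', '%dpx', default='unknown')
# 'unknown'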
5243
5244
5245 def clean_podcast_url(url):
5246 return re.sub(r'''(?x)
5247 (?:
5248 (?:
5249 chtbl\.com/track|
5250 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5251 play\.podtrac\.com
5252 )/[^/]+|
5253 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5254 flex\.acast\.com|
5255 pd(?:
5256 cn\.co| # https://podcorn.com/analytics-prefix/
5257 st\.fm # https://podsights.com/docs/
5258 )/e
5259 )/''', '', url)
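# e.g. stripping a tracking prefix (the ID here is a made-up placeholder):
# >>> clean_podcast_url('https://chtbl.com/track/12345/traffic.example.com/ep.mp3')
# 'https://traffic.example.com/ep.mp3'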
5260
5261
5262 _HEX_TABLE = '0123456789abcdef'
5263
5264
5265 def random_uuidv4():
5266 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5267
5268
5269 def make_dir(path, to_screen=None):
5270 try:
5271 dn = os.path.dirname(path)
5272 if dn and not os.path.exists(dn):
5273 os.makedirs(dn)
5274 return True
5275 except OSError as err:
5276 if callable(to_screen):  # `callable()` returns a bool; comparing it to None was always true
5277 to_screen('unable to create directory ' + error_to_compat_str(err))
5278 return False
5279
5280
5281 def get_executable_path():
5282 from .update import _get_variant_and_executable_path
5283
5284 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5285
5286
5287 def load_plugins(name, suffix, namespace):
5288 classes = {}
5289 with contextlib.suppress(FileNotFoundError):
5290 plugins_spec = importlib.util.spec_from_file_location(
5291 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5292 plugins = importlib.util.module_from_spec(plugins_spec)
5293 sys.modules[plugins_spec.name] = plugins
5294 plugins_spec.loader.exec_module(plugins)
5295 for name in dir(plugins):
5296 if name in namespace:
5297 continue
5298 if not name.endswith(suffix):
5299 continue
5300 klass = getattr(plugins, name)
5301 classes[name] = namespace[name] = klass
5302 return classes
5303
5304
5305 def traverse_obj(
5306 obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
5307 casesense=True, is_user_input=False, traverse_string=False):
5308 """
5309 Safely traverse nested `dict`s and `Sequence`s
5310
5311 >>> obj = [{}, {"key": "value"}]
5312 >>> traverse_obj(obj, (1, "key"))
5313 "value"
5314
5315 Each of the provided `paths` is tested and the first producing a valid result will be returned.
5316 The next path will also be tested if the path branched but no results could be found.
5317 Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
5318 A value of None is treated as the absence of a value.
5319
5320 The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5321
5322 The keys in the path can be one of:
5323 - `None`: Return the current object.
5324 - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
5325 - `slice`: Branch out and return all values in `obj[key]`.
5326 - `Ellipsis`: Branch out and return a list of all values.
5327 - `tuple`/`list`: Branch out and return a list of all matching values.
5328 Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5329 - `function`: Branch out and return values filtered by the function.
5330 Read as: `[value for key, value in obj if function(key, value)]`.
5331 For `Sequence`s, `key` is the index of the value.
5332 - `dict` Transform the current object and return a matching dict.
5333 Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5334
5335 `tuple`, `list`, and `dict` all support nested paths and branches.
5336
5337 @params paths Paths which to traverse by.
5338 @param default Value to return if the paths do not match.
5339 @param expected_type If a `type`, only accept final values of this type.
5340 If any other callable, try to call the function on each result.
5341 @param get_all If `False`, return the first matching result, otherwise all matching ones.
5342 @param casesense If `False`, consider string dictionary keys as case insensitive.
5343
5344 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5345
5346 @param is_user_input Whether the keys are generated from user input.
5347 If `True` strings get converted to `int`/`slice` if needed.
5348 @param traverse_string Whether to traverse into objects as strings.
5349 If `True`, any non-compatible object will first be
5350 converted into a string and then traversed into.
5351
5352
5353 @returns The result of the object traversal.
5354 If successful, `get_all=True`, and the path branches at least once,
5355 then a list of results is returned instead.
5356 A list is always returned if the last path branches and no `default` is given.
5357 """
5358 is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
5359 casefold = lambda k: k.casefold() if isinstance(k, str) else k
5360
5361 if isinstance(expected_type, type):
5362 type_test = lambda val: val if isinstance(val, expected_type) else None
5363 else:
5364 type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5365
5366 def apply_key(key, obj):
5367 if obj is None:
5368 return
5369
5370 elif key is None:
5371 yield obj
5372
5373 elif isinstance(key, (list, tuple)):
5374 for branch in key:
5375 _, result = apply_path(obj, branch)
5376 yield from result
5377
5378 elif key is ...:
5379 if isinstance(obj, collections.abc.Mapping):
5380 yield from obj.values()
5381 elif is_sequence(obj):
5382 yield from obj
5383 elif isinstance(obj, re.Match):
5384 yield from obj.groups()
5385 elif traverse_string:
5386 yield from str(obj)
5387
5388 elif callable(key):
5389 if is_sequence(obj):
5390 iter_obj = enumerate(obj)
5391 elif isinstance(obj, collections.abc.Mapping):
5392 iter_obj = obj.items()
5393 elif isinstance(obj, re.Match):
5394 iter_obj = enumerate((obj.group(), *obj.groups()))
5395 elif traverse_string:
5396 iter_obj = enumerate(str(obj))
5397 else:
5398 return
5399 yield from (v for k, v in iter_obj if try_call(key, args=(k, v)))
5400
5401 elif isinstance(key, dict):
5402 iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items())
5403 yield {k: v if v is not None else default for k, v in iter_obj
5404 if v is not None or default is not NO_DEFAULT}
5405
5406 elif isinstance(obj, collections.abc.Mapping):
5407 yield (obj.get(key) if casesense or (key in obj)
5408 else next((v for k, v in obj.items() if casefold(k) == key), None))
5409
5410 elif isinstance(obj, re.Match):
5411 if isinstance(key, int) or casesense:
5412 with contextlib.suppress(IndexError):
5413 yield obj.group(key)
5414 return
5415
5416 if not isinstance(key, str):
5417 return
5418
5419 yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
5420
5421 else:
5422 if is_user_input:
5423 key = (int_or_none(key) if ':' not in key
5424 else slice(*map(int_or_none, key.split(':'))))
5425
5426 if not isinstance(key, (int, slice)):
5427 return
5428
5429 if not is_sequence(obj):
5430 if not traverse_string:
5431 return
5432 obj = str(obj)
5433
5434 with contextlib.suppress(IndexError):
5435 yield obj[key]
5436
5437 def apply_path(start_obj, path):
5438 objs = (start_obj,)
5439 has_branched = False
5440
5441 for key in variadic(path):
5442 if is_user_input and key == ':':
5443 key = ...
5444
5445 if not casesense and isinstance(key, str):
5446 key = key.casefold()
5447
5448 if key is ... or isinstance(key, (list, tuple)) or callable(key):
5449 has_branched = True
5450
5451 key_func = functools.partial(apply_key, key)
5452 objs = itertools.chain.from_iterable(map(key_func, objs))
5453
5454 return has_branched, objs
5455
5456 def _traverse_obj(obj, path, use_list=True):
5457 has_branched, results = apply_path(obj, path)
5458 results = LazyList(x for x in map(type_test, results) if x is not None)
5459
5460 if get_all and has_branched:
5461 return results.exhaust() if results or use_list else None
5462
5463 return results[0] if results else None
5464
5465 for index, path in enumerate(paths, 1):
5466 use_list = default is NO_DEFAULT and index == len(paths)
5467 result = _traverse_obj(obj, path, use_list)
5468 if result is not None:
5469 return result
5470
5471 return None if default is NO_DEFAULT else default
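# A minimal illustration of the traversal semantics documented above (not part of the API):
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))  # -> [1, 2] (`...` branches, so a list is returned)
#   traverse_obj({'a': {'B': 3}}, ('a', 'b'), casesense=False)  # -> 3
#   traverse_obj({}, ('a', 'b'), default='missing')             # -> 'missing'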
5472
5473
5474 def traverse_dict(dictn, keys, casesense=True):
5475 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5476 f'in a future version. Use "{__name__}.traverse_obj" instead')
5477 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5478
5479
5480 def get_first(obj, keys, **kwargs):
5481 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
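# e.g. get_first([{'id': None}, {'id': '123'}], 'id') -> '123'
# (branches over every entry and returns the first non-None match)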
5482
5483
5484 def time_seconds(**kwargs):
5485 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5486 return t.timestamp()
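# NB: an aware datetime's .timestamp() is offset-independent, so e.g.
# time_seconds(hours=9) returns the same epoch value as time_seconds()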
5487
5488
5489 # create a JSON Web Signature (JWS) with the HS256 algorithm
5490 # the resulting format is JWS Compact Serialization
5491 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5492 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5493 def jwt_encode_hs256(payload_data, key, headers=None):
5494 header_data = {
5495 'alg': 'HS256',
5496 'typ': 'JWT',
5497 }
5498 if headers:
5499 header_data.update(headers)
5500 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5501 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5502 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5503 signature_b64 = base64.b64encode(h.digest())
5504 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5505 return token
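# NB: standard base64 (with '=' padding) is used above rather than the unpadded
# base64url that RFC 7515 specifies, so strict JWS consumers may reject the token.
# Illustrative call (the token bytes depend on the inputs; only the shape is shown):
#   jwt_encode_hs256({'iss': 'example'}, 'secret-key')  # -> b'<header>.<payload>.<signature>'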
5506
5507
5508 # can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5509 def jwt_decode_hs256(jwt):
5510 header_b64, payload_b64, signature_b64 = jwt.split('.')
5511 # add the trailing '='s that may have been stripped; superfluous '='s are ignored
5512 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
5513 return payload_data
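# e.g. jwt_decode_hs256(jwt_encode_hs256({'iss': 'example'}, 'k').decode()) -> {'iss': 'example'}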
5514
5515
5516 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5517
5518
5519 @functools.cache
5520 def supports_terminal_sequences(stream):
5521 if compat_os_name == 'nt':
5522 if not WINDOWS_VT_MODE:
5523 return False
5524 elif not os.getenv('TERM'):
5525 return False
5526 try:
5527 return stream.isatty()
5528 except BaseException:
5529 return False
5530
5531
5532 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5533 if get_windows_version() < (10, 0, 10586):
5534 return
5535 global WINDOWS_VT_MODE
5536 try:
5537 Popen.run('', shell=True)
5538 except Exception:
5539 return
5540
5541 WINDOWS_VT_MODE = True
5542 supports_terminal_sequences.cache_clear()
5543
5544
5545 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5546
5547
5548 def remove_terminal_sequences(string):
5549 return _terminal_sequences_re.sub('', string)
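# e.g. remove_terminal_sequences('\033[0;31mError\033[0m') -> 'Error'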
5550
5551
5552 def number_of_digits(number):
5553 return len('%d' % number)  # NB: the '-' sign of a negative number is counted as a digit
5554
5555
5556 def join_nonempty(*values, delim='-', from_dict=None):
5557 if from_dict is not None:
5558 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5559 return delim.join(map(str, filter(None, values)))
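# e.g. join_nonempty('1080p', None, '', 'dash') -> '1080p-dash'
#      join_nonempty('width', 'height', from_dict={'width': 1920, 'height': 1080}, delim='x') -> '1920x1080'
# NB: all falsy values (including 0) are dropped by filter(None, ...)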
5560
5561
5562 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5563 """
5564 Find the largest format dimensions in terms of video width and, for each thumbnail:
5565 * Modify the URL: Match the width with the provided regex and replace it with the largest format's width
5566 * Update dimensions
5567
5568 This function is useful for video services that scale their thumbnails on demand
5569 """
5570 _keys = ('width', 'height')
5571 max_dimensions = max(
5572 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5573 default=(0, 0))
5574 if not max_dimensions[0]:
5575 return thumbnails
5576 return [
5577 merge_dicts(
5578 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5579 dict(zip(_keys, max_dimensions)), thumbnail)
5580 for thumbnail in thumbnails
5581 ]
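# Sketch of the intended effect (hypothetical URL pattern and values):
#   scale_thumbnails_to_max_format_width(
#       [{'width': 1280, 'height': 720}],
#       [{'url': 'https://example.com/w320/thumb.jpg'}], r'(?<=/w)\d+')
#   # -> [{'url': 'https://example.com/w1280/thumb.jpg', 'width': 1280, 'height': 720}]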
5582
5583
5584 def parse_http_range(range):
5585 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5586 if not range:
5587 return None, None, None
5588 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5589 if not crg:
5590 return None, None, None
5591 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
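# e.g. parse_http_range('bytes 0-499/1234') -> (0, 499, 1234)
#      parse_http_range('bytes=500-')       -> (500, None, None)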
5592
5593
5594 def read_stdin(what):
5595 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5596 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5597 return sys.stdin
5598
5599
5600 def determine_file_encoding(data):
5601 """
5602 Detect the text encoding used
5603 @returns (encoding, bytes to skip)
5604 """
5605
5606 # BOMs take priority over coding declarations
5607 for bom, enc in BOMS:
5608 if data.startswith(bom):
5609 return enc, len(bom)
5610
5611 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5612 # We ignore the endianness to get a good enough match
5613 data = data.replace(b'\0', b'')
5614 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5615 return mobj.group(1).decode() if mobj else None, 0
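# e.g. determine_file_encoding(b'# coding: utf-8\n--flag') -> ('utf-8', 0)
# (a BOM, if present, takes priority instead and the returned offset skips past it)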
5616
5617
5618 class Config:
5619 own_args = None
5620 parsed_args = None
5621 filename = None
5622 __initialized = False
5623
5624 def __init__(self, parser, label=None):
5625 self.parser, self.label = parser, label
5626 self._loaded_paths, self.configs = set(), []
5627
5628 def init(self, args=None, filename=None):
5629 assert not self.__initialized
5630 self.own_args, self.filename = args, filename
5631 return self.load_configs()
5632
5633 def load_configs(self):
5634 directory = ''
5635 if self.filename:
5636 location = os.path.realpath(self.filename)
5637 directory = os.path.dirname(location)
5638 if location in self._loaded_paths:
5639 return False
5640 self._loaded_paths.add(location)
5641
5642 self.__initialized = True
5643 opts, _ = self.parser.parse_known_args(self.own_args)
5644 self.parsed_args = self.own_args
5645 for location in opts.config_locations or []:
5646 if location == '-':
5647 if location in self._loaded_paths:
5648 continue
5649 self._loaded_paths.add(location)
5650 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5651 continue
5652 location = os.path.join(directory, expand_path(location))
5653 if os.path.isdir(location):
5654 location = os.path.join(location, 'yt-dlp.conf')
5655 if not os.path.exists(location):
5656 self.parser.error(f'config location {location} does not exist')
5657 self.append_config(self.read_file(location), location)
5658 return True
5659
5660 def __str__(self):
5661 label = join_nonempty(
5662 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5663 delim=' ')
5664 return join_nonempty(
5665 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5666 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5667 delim='\n')
5668
5669 @staticmethod
5670 def read_file(filename, default=[]):
5671 try:
5672 optionf = open(filename, 'rb')
5673 except OSError:
5674 return default # silently skip if file is not present
5675 try:
5676 enc, skip = determine_file_encoding(optionf.read(512))
5677 optionf.seek(skip, io.SEEK_SET)
5678 except OSError:
5679 enc = None # silently skip read errors
5680 try:
5681 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5682 contents = optionf.read().decode(enc or preferredencoding())
5683 res = shlex.split(contents, comments=True)
5684 except Exception as err:
5685 raise ValueError(f'Unable to parse "{filename}": {err}')
5686 finally:
5687 optionf.close()
5688 return res
5689
5690 @staticmethod
5691 def hide_login_info(opts):
5692 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5693 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5694
5695 def _scrub_eq(o):
5696 m = eqre.match(o)
5697 if m:
5698 return m.group('key') + '=PRIVATE'
5699 else:
5700 return o
5701
5702 opts = list(map(_scrub_eq, opts))
5703 for idx, opt in enumerate(opts):
5704 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5705 opts[idx + 1] = 'PRIVATE'
5706 return opts
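# e.g. Config.hide_login_info(['-u', 'name', '--password=secret'])
#      -> ['-u', 'PRIVATE', '--password=PRIVATE']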
5707
5708 def append_config(self, *args, label=None):
5709 config = type(self)(self.parser, label)
5710 config._loaded_paths = self._loaded_paths
5711 if config.init(*args):
5712 self.configs.append(config)
5713
5714 @property
5715 def all_args(self):
5716 for config in reversed(self.configs):
5717 yield from config.all_args
5718 yield from self.parsed_args or []
5719
5720 def parse_known_args(self, **kwargs):
5721 return self.parser.parse_known_args(self.all_args, **kwargs)
5722
5723 def parse_args(self):
5724 return self.parser.parse_args(self.all_args)
5725
5726
5727 class WebSocketsWrapper:
5728 """Wraps websockets module to use in non-async scopes"""
5729 pool = None
5730
5731 def __init__(self, url, headers=None, connect=True):
5732 self.loop = asyncio.new_event_loop()
5733 # XXX: "loop" is deprecated
5734 self.conn = websockets.connect(
5735 url, extra_headers=headers, ping_interval=None,
5736 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5737 if connect:
5738 self.__enter__()
5739 atexit.register(self.__exit__, None, None, None)
5740
5741 def __enter__(self):
5742 if not self.pool:
5743 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5744 return self
5745
5746 def send(self, *args):
5747 self.run_with_loop(self.pool.send(*args), self.loop)
5748
5749 def recv(self, *args):
5750 return self.run_with_loop(self.pool.recv(*args), self.loop)
5751
5752 def __exit__(self, type, value, traceback):
5753 try:
5754 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5755 finally:
5756 self._cancel_all_tasks(self.loop)  # must run while the loop is still open
5757 self.loop.close()
5758
5759 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5760 # for contributors: if a new library that uses asyncio needs to be run in non-async code, move these functions out of this class
5761 @staticmethod
5762 def run_with_loop(main, loop):
5763 if not asyncio.iscoroutine(main):
5764 raise ValueError(f'a coroutine was expected, got {main!r}')
5765
5766 try:
5767 return loop.run_until_complete(main)
5768 finally:
5769 loop.run_until_complete(loop.shutdown_asyncgens())
5770 if hasattr(loop, 'shutdown_default_executor'):
5771 loop.run_until_complete(loop.shutdown_default_executor())
5772
5773 @staticmethod
5774 def _cancel_all_tasks(loop):
5775 to_cancel = asyncio.all_tasks(loop)
5776
5777 if not to_cancel:
5778 return
5779
5780 for task in to_cancel:
5781 task.cancel()
5782
5783 # XXX: "loop" is removed in python 3.10+
5784 loop.run_until_complete(
5785 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
5786
5787 for task in to_cancel:
5788 if task.cancelled():
5789 continue
5790 if task.exception() is not None:
5791 loop.call_exception_handler({
5792 'message': 'unhandled exception during asyncio.run() shutdown',
5793 'exception': task.exception(),
5794 'task': task,
5795 })
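# Rough usage sketch (hypothetical endpoint; real callers live in the extractors):
#   ws = WebSocketsWrapper('wss://example.com/socket', headers={'Origin': 'https://example.com'})
#   ws.send('{"op": "subscribe"}')
#   reply = ws.recv()
#   ws.__exit__(None, None, None)  # also registered via atexit when connect=True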
5796
5797
5798 def merge_headers(*dicts):
5799 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5800 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
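# e.g. merge_headers({'user-agent': 'A', 'X-Foo': '1'}, {'User-Agent': 'B'}) -> {'User-Agent': 'B', 'X-Foo': '1'}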
5801
5802
5803 def cached_method(f):
5804 """Cache a method"""
5805 signature = inspect.signature(f)
5806
5807 @functools.wraps(f)
5808 def wrapper(self, *args, **kwargs):
5809 bound_args = signature.bind(self, *args, **kwargs)
5810 bound_args.apply_defaults()
5811 key = tuple(bound_args.arguments.values())[1:]
5812
5813 cache = vars(self).setdefault('__cached_method__cache', {}).setdefault(f.__name__, {})
5814 if key not in cache:
5815 cache[key] = f(self, *args, **kwargs)
5816 return cache[key]
5817 return wrapper
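# Sketch (hypothetical class): the cache lives on the instance, keyed by the bound arguments:
#   class Client:
#       @cached_method
#       def fetch(self, url, retries=3):
#           ...  # runs once per distinct (url, retries) per instance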
5818
5819
5820 class classproperty:
5821 """property access for class methods"""
5822
5823 def __init__(self, func):
5824 functools.update_wrapper(self, func)
5825 self.func = func
5826
5827 def __get__(self, _, cls):
5828 return self.func(cls)
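# Sketch (hypothetical class):
#   class MyIE:
#       @classproperty
#       def name(cls):
#           return cls.__name__  # computed on attribute access, no instance needed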
5829
5830
5831 class Namespace(types.SimpleNamespace):
5832 """Immutable namespace"""
5833
5834 def __iter__(self):
5835 return iter(self.__dict__.values())
5836
5837 @property
5838 def items_(self):
5839 return self.__dict__.items()
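# e.g. ns = Namespace(a=1, b=2); list(ns) -> [1, 2]; dict(ns.items_) -> {'a': 1, 'b': 2}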
5840
5841
5842 MEDIA_EXTENSIONS = Namespace(
5843 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5844 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5845 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5846 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5847 thumbnails=('jpg', 'png', 'webp'),
5848 storyboards=('mhtml', ),
5849 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5850 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5851 )
5852 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5853 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5854
5855 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5856
5857
5858 class RetryManager:
5859 """Usage:
5860 for retry in RetryManager(...):
5861 try:
5862 ...
5863 except SomeException as err:
5864 retry.error = err
5865 continue
5866 """
5867 attempt, _error = 0, None
5868
5869 def __init__(self, _retries, _error_callback, **kwargs):
5870 self.retries = _retries or 0
5871 self.error_callback = functools.partial(_error_callback, **kwargs)
5872
5873 def _should_retry(self):
5874 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5875
5876 @property
5877 def error(self):
5878 if self._error is NO_DEFAULT:
5879 return None
5880 return self._error
5881
5882 @error.setter
5883 def error(self, value):
5884 self._error = value
5885
5886 def __iter__(self):
5887 while self._should_retry():
5888 self.error = NO_DEFAULT
5889 self.attempt += 1
5890 yield self
5891 if self.error:
5892 self.error_callback(self.error, self.attempt, self.retries)
5893
5894 @staticmethod
5895 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5896 """Utility function for reporting retries"""
5897 if count > retries:
5898 if error:
5899 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5900 raise e
5901
5902 if not count:
5903 return warn(e)
5904 elif isinstance(e, ExtractorError):
5905 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5906 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5907
5908 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5909 if delay:
5910 info(f'Sleeping {delay:.2f} seconds ...')
5911 time.sleep(delay)
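# e.g. (assuming `logger` provides the callables; see the class docstring for the loop body):
#   for retry in RetryManager(3, RetryManager.report_retry, sleep_func=1,
#                             info=logger.info, warn=logger.warning):
#       ...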
5912
5913
5914 def make_archive_id(ie, video_id):
5915 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5916 return f'{ie_key.lower()} {video_id}'
5917
5918
5919 def truncate_string(s, left, right=0):
5920 assert left > 3 and right >= 0
5921 if s is None or len(s) <= left + right:
5922 return s
5923 return f'{s[:left - 3]}...{s[-right:] if right else ""}'
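# e.g. truncate_string('abcdefghij', 7) -> 'abcd...' and truncate_string('abcdefghij', 5, 3) -> 'ab...hij'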
5924
5925
5926 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5927 assert 'all' in alias_dict, '"all" alias is required'
5928 requested = list(start or [])
5929 for val in options:
5930 discard = val.startswith('-')
5931 if discard:
5932 val = val[1:]
5933
5934 if val in alias_dict:
5935 val = alias_dict[val] if not discard else [
5936 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5937 # NB: Do not allow regex in aliases for performance
5938 requested = orderedSet_from_options(val, alias_dict, start=requested)
5939 continue
5940
5941 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5942 else [val] if val in alias_dict['all'] else None)
5943 if current is None:
5944 raise ValueError(val)
5945
5946 if discard:
5947 for item in current:
5948 while item in requested:
5949 requested.remove(item)
5950 else:
5951 requested.extend(current)
5952
5953 return orderedSet(requested)
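# e.g. orderedSet_from_options(['all', '-flac'], {'all': ['mp3', 'flac']}) -> ['mp3']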
5954
5955
5956 # Deprecated
5957 has_certifi = bool(certifi)
5958 has_websockets = bool(websockets)