import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)

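# NOTE (illustrative, not part of the original module): after the call above,
# SOCKS schemes are known to urllib, e.g.
#
#   >>> register_socks_protocols()
#   >>> 'socks5' in urllib.parse.uses_netloc
#   True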

# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise

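# NOTE (illustrative sketch, not part of the original module; the filename is
# hypothetical and assumes a writable working directory):
#
#   >>> write_json_file({'id': 'abc', 'title': 'Example'}, 'info.json')
#   >>> json.load(open('info.json', encoding='utf-8'))
#   {'id': 'abc', 'title': 'Example'}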

def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

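# NOTE (illustrative, not part of the original module): prefixed components
# are expanded into ElementTree's '{namespace}tag' form, e.g.
#
#   >>> xpath_with_ns('media:song/media:author', {'media': 'http://example.com/ns'})
#   '{http://example.com/ns}song/{http://example.com/ns}author'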

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None

def get_element_html_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kwargs):
387 """Return the content of all tags with the specified class in the passed HTML document as a list"""
388 return get_elements_by_attribute(
389 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
390 html, escape_value=False)
391
392
393 def get_elements_html_by_class(class_name, html):
394 """Return the html of all tags with the specified class in the passed HTML document as a list"""
395 return get_elements_html_by_attribute(
396 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
397 html, escape_value=False)
398
399
400 def get_elements_by_attribute(*args, **kwargs):
401 """Return the content of the tag with the specified attribute in the passed HTML document"""
402 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
403
404
405 def get_elements_html_by_attribute(*args, **kwargs):
406 """Return the html of the tag with the specified attribute in the passed HTML document"""
407 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
408
409
410 def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
411 """
412 Return the text (content) and the html (whole) of the tag with the specified
413 attribute in the passed HTML document
414 """
415
416 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
417
418 value = re.escape(value) if escape_value else value
419
420 partial_element_re = rf'''(?x)
421 <(?P<tag>[a-zA-Z0-9:._-]+)
422 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
423 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
424 '''
425
426 for m in re.finditer(partial_element_re, html):
427 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
428
429 yield (
430 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
431 whole
432 )
433
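# NOTE (illustrative, not part of the original module; the markup is made up):
#
#   >>> list(get_elements_text_and_html_by_attribute('data-id', '1', '<p data-id="1">x</p>'))
#   [('x', '<p data-id="1">x</p>')]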

class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
481 """
482 def find_or_raise(haystack, needle, exc):
483 try:
484 return haystack.index(needle)
485 except ValueError:
486 raise exc
487 closing_tag = f'</{tag}>'
488 whole_start = find_or_raise(
489 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
490 content_start = find_or_raise(
491 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
492 content_start += whole_start + 1
493 with HTMLBreakOnClosingTagParser() as parser:
494 parser.feed(html[whole_start:content_start])
495 if not parser.tagstack or parser.tagstack[0] != tag:
496 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
497 offset = content_start
498 while offset < len(html):
499 next_closing_tag_start = find_or_raise(
500 html[offset:], closing_tag,
501 compat_HTMLParseError(f'closing {tag} tag not found'))
502 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
503 try:
504 parser.feed(html[offset:offset + next_closing_tag_end])
505 offset += next_closing_tag_end
506 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
507 return html[content_start:offset + next_closing_tag_start], \
508 html[whole_start:offset + next_closing_tag_end]
509 raise compat_HTMLParseError('unexpected end of html')
510
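# NOTE (illustrative, not part of the original module):
#
#   >>> get_element_text_and_html_by_tag('span', '<div><span>hi</span></div>')
#   ('hi', '<span>hi</span>')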

class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs

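# NOTE (illustrative, not part of the original module; mirrors the docstring):
#
#   >>> extract_attributes('<el a="foo" d=boz noval>')
#   {'a': 'foo', 'd': 'boz', 'noval': None}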

def parse_list(webpage):
562 """Given a string for an series of HTML <li> elements,
563 return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

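# NOTE (illustrative, not part of the original module): <br> becomes a newline,
# remaining tags are stripped and entities decoded, e.g.
#
#   >>> clean_html('<p>first line<br/>second &amp; last</p>')
#   'first line\nsecond & last'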

class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if self.ignore_extra:
            return self.raw_decode(s.lstrip())[0]
        return super().decode(s)

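# NOTE (illustrative, not part of the original module): with ignore_extra=True,
# trailing garbage after the first JSON value is tolerated, e.g.
#
#   >>> LenientJSONDecoder(ignore_extra=True).decode('{"a": 1}); // trailer')
#   {'a': 1}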

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted      Use a stricter subset of allowed characters
    @param is_id           Whether this is an ID that should be kept unchanged if possible.
                           If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    if restricted and is_id is NO_DEFAULT:
        s = unicodedata.normalize('NFKC', s)
        s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

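# NOTE (illustrative, not part of the original module; outputs were derived by
# hand from the rules above and assume the default is_id=NO_DEFAULT path):
#
#   >>> sanitize_filename('A/B: C', restricted=True)
#   'A_B_-_C'
#   >>> sanitize_filename('foo|bar')  # '|' becomes its full-width counterpart
#   'foo｜bar'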

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

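# NOTE (illustrative, not part of the original module):
#
#   >>> sanitize_url('//example.com/watch')
#   'http://example.com/watch'
#   >>> sanitize_url('httpss://example.com')
#   'https://example.com'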

def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'

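# NOTE (illustrative, not part of the original module; credentials are moved
# out of the netloc and into a Basic auth header):
#
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')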

def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())

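# NOTE (illustrative, not part of the original module): order-preserving dedup
# that also accepts unhashable items, at the cost of O(n^2) membership checks.
#
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]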

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

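# NOTE (illustrative, not part of the original module):
#
#   >>> formatSeconds(61.5, msec=True)
#   '1:01.500'
#   >>> formatSeconds(3661)
#   '1:01:01'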

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616, the default charset is iso-8859-1, which Python 3 respects
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return urllib.request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return urllib.request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
            (?P<sign>\+|-)                                       # +/-
            (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})           # hh[:]mm
            $)
        ''', date_str)
    if not m:
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            date_str = date_str[:-len(m.group('tz'))]
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str

1713
1714 def parse_iso8601(date_str, delimiter='T', timezone=None):
1715 """ Return a UNIX timestamp from the given date """
1716
1717 if date_str is None:
1718 return None
1719
1720 date_str = re.sub(r'\.[0-9]+', '', date_str)
1721
1722 if timezone is None:
1723 timezone, date_str = extract_timezone(date_str)
1724
1725 with contextlib.suppress(ValueError):
1726 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1727 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1728 return calendar.timegm(dt.timetuple())
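# Illustrative usage of parse_iso8601 (a sketch; the fractional part is
# stripped before parsing and any offset is normalized back to UTC):
#   >>> parse_iso8601('1970-01-01T00:00:00Z')
#   0
#   >>> parse_iso8601('1970-01-01T01:00:00+01:00')
#   0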
1729
1730
1731 def date_formats(day_first=True):
1732 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1733
1734
1735 def unified_strdate(date_str, day_first=True):
1736 """Return a string with the date in the format YYYYMMDD"""
1737
1738 if date_str is None:
1739 return None
1740 upload_date = None
1741 # Replace commas
1742 date_str = date_str.replace(',', ' ')
1743 # Remove AM/PM + timezone
1744 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1745 _, date_str = extract_timezone(date_str)
1746
1747 for expression in date_formats(day_first):
1748 with contextlib.suppress(ValueError):
1749 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1750 if upload_date is None:
1751 timetuple = email.utils.parsedate_tz(date_str)
1752 if timetuple:
1753 with contextlib.suppress(ValueError):
1754 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1755 if upload_date is not None:
1756 return str(upload_date)
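# Illustrative usage of unified_strdate (a sketch of the expected behaviour;
# commas and AM/PM/timezone markers are stripped before format matching):
#   >>> unified_strdate('December 21, 2010')
#   '20101221'
#   >>> unified_strdate('8/7/2009')  # day_first=True by default
#   '20090708'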
1757
1758
1759 def unified_timestamp(date_str, day_first=True):
1760 if date_str is None:
1761 return None
1762
1763 date_str = re.sub(r'\s+', ' ', re.sub(
1764 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1765
1766 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1767 timezone, date_str = extract_timezone(date_str)
1768
1769 # Remove AM/PM + timezone
1770 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1771
1772 # Remove unrecognized timezones from ISO 8601 alike timestamps
1773 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1774 if m:
1775 date_str = date_str[:-len(m.group('tz'))]
1776
1777 # Python only supports microseconds, so remove nanoseconds
1778 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1779 if m:
1780 date_str = m.group(1)
1781
1782 for expression in date_formats(day_first):
1783 with contextlib.suppress(ValueError):
1784 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1785 return calendar.timegm(dt.timetuple())
1786
1787 timetuple = email.utils.parsedate_tz(date_str)
1788 if timetuple:
1789 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1790
1791
1792 def determine_ext(url, default_ext='unknown_video'):
1793 if url is None or '.' not in url:
1794 return default_ext
1795 guess = url.partition('?')[0].rpartition('.')[2]
1796 if re.match(r'^[A-Za-z0-9]+$', guess):
1797 return guess
1798 # Try to extract the ext from URLs like http://example.com/foo/bar.mp4/?download
1799 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1800 return guess.rstrip('/')
1801 else:
1802 return default_ext
1803
1804
1805 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1806 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1807
1808
1809 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1810 R"""
1811 Return a datetime object from a string.
1812 Supported format:
1813 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1814
1815 @param format strftime format of DATE
1816 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1817 auto: round to the unit provided in date_str (if applicable).
1818 """
1819 auto_precision = False
1820 if precision == 'auto':
1821 auto_precision = True
1822 precision = 'microsecond'
1823 today = datetime_round(datetime.datetime.utcnow(), precision)
1824 if date_str in ('now', 'today'):
1825 return today
1826 if date_str == 'yesterday':
1827 return today - datetime.timedelta(days=1)
1828 match = re.match(
1829 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1830 date_str)
1831 if match is not None:
1832 start_time = datetime_from_str(match.group('start'), precision, format)
1833 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1834 unit = match.group('unit')
1835 if unit == 'month' or unit == 'year':
1836 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1837 unit = 'day'
1838 else:
1839 if unit == 'week':
1840 unit = 'day'
1841 time *= 7
1842 delta = datetime.timedelta(**{unit + 's': time})
1843 new_date = start_time + delta
1844 if auto_precision:
1845 return datetime_round(new_date, unit)
1846 return new_date
1847
1848 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
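# Illustrative usage (a sketch; with the default precision='auto' the result
# is rounded to the unit used in the expression):
#   datetime_from_str('now-1day')        # ~24 hours ago, rounded to the nearest day
#   datetime_from_str('20200229+1year')  # 2021-02-28, day clamped by datetime_add_months below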
1849
1850
1851 def date_from_str(date_str, format='%Y%m%d', strict=False):
1852 R"""
1853 Return a date object from a string using datetime_from_str
1854
1855 @param strict Restrict allowed patterns to "YYYYMMDD" and
1856 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1857 """
1858 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1859 raise ValueError(f'Invalid date format "{date_str}"')
1860 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1861
1862
1863 def datetime_add_months(dt, months):
1864 """Increment/Decrement a datetime object by months."""
1865 month = dt.month + months - 1
1866 year = dt.year + month // 12
1867 month = month % 12 + 1
1868 day = min(dt.day, calendar.monthrange(year, month)[1])
1869 return dt.replace(year, month, day)
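# Illustrative example: the day is clamped to the target month's length, so
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1)
# should give datetime.datetime(2020, 2, 29) (2020 is a leap year).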
1870
1871
1872 def datetime_round(dt, precision='day'):
1873 """
1874 Round a datetime object's time to a specific precision
1875 """
1876 if precision == 'microsecond':
1877 return dt
1878
1879 unit_seconds = {
1880 'day': 86400,
1881 'hour': 3600,
1882 'minute': 60,
1883 'second': 1,
1884 }
1885 roundto = lambda x, n: ((x + n / 2) // n) * n
1886 timestamp = calendar.timegm(dt.timetuple())
1887 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1888
1889
1890 def hyphenate_date(date_str):
1891 """
1892 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1893 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1894 if match is not None:
1895 return '-'.join(match.groups())
1896 else:
1897 return date_str
1898
1899
1900 class DateRange:
1901 """Represents a time interval between two dates"""
1902
1903 def __init__(self, start=None, end=None):
1904 """start and end must be strings in the format accepted by date"""
1905 if start is not None:
1906 self.start = date_from_str(start, strict=True)
1907 else:
1908 self.start = datetime.datetime.min.date()
1909 if end is not None:
1910 self.end = date_from_str(end, strict=True)
1911 else:
1912 self.end = datetime.datetime.max.date()
1913 if self.start > self.end:
1914 raise ValueError('Date range: "%s"; the start date must be before the end date' % self)
1915
1916 @classmethod
1917 def day(cls, day):
1918 """Returns a range that only contains the given day"""
1919 return cls(day, day)
1920
1921 def __contains__(self, date):
1922 """Check if the date is in the range"""
1923 if not isinstance(date, datetime.date):
1924 date = date_from_str(date)
1925 return self.start <= date <= self.end
1926
1927 def __str__(self):
1928 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1929
1930 def __eq__(self, other):
1931 return (isinstance(other, DateRange)
1932 and self.start == other.start and self.end == other.end)
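# Illustrative usage of DateRange (a sketch; strings go through date_from_str):
#   >>> '20220615' in DateRange('20220101', '20221231')
#   True
#   >>> '20230101' in DateRange('20220101', '20221231')
#   False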
1933
1934
1935 def platform_name():
1936 """ Returns the platform name as a str """
1937 write_string('DeprecationWarning: yt_dlp.utils.platform_name is deprecated, use platform.platform instead')
1938 return platform.platform()
1939
1940
1941 @functools.cache
1942 def system_identifier():
1943 python_implementation = platform.python_implementation()
1944 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1945 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
1946
1947 return 'Python %s (%s %s) - %s %s' % (
1948 platform.python_version(),
1949 python_implementation,
1950 platform.architecture()[0],
1951 platform.platform(),
1952 format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)'),
1953 )
1954
1955
1956 @functools.cache
1957 def get_windows_version():
1958 ''' Get the Windows version. Returns () if not running on Windows '''
1959 if compat_os_name == 'nt':
1960 return version_tuple(platform.win32_ver()[1])
1961 else:
1962 return ()
1963
1964
1965 def write_string(s, out=None, encoding=None):
1966 assert isinstance(s, str)
1967 out = out or sys.stderr
1968
1969 if compat_os_name == 'nt' and supports_terminal_sequences(out):
1970 s = re.sub(r'([\r\n]+)', r' \1', s)
1971
1972 enc, buffer = None, out
1973 if 'b' in getattr(out, 'mode', ''):
1974 enc = encoding or preferredencoding()
1975 elif hasattr(out, 'buffer'):
1976 buffer = out.buffer
1977 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1978
1979 buffer.write(s.encode(enc, 'ignore') if enc else s)
1980 out.flush()
1981
1982
1983 def bytes_to_intlist(bs):
1984 if not bs:
1985 return []
1986 if isinstance(bs[0], int): # Python 3
1987 return list(bs)
1988 else:
1989 return [ord(c) for c in bs]
1990
1991
1992 def intlist_to_bytes(xs):
1993 if not xs:
1994 return b''
1995 return struct.pack('%dB' % len(xs), *xs)
1996
1997
1998 class LockingUnsupportedError(OSError):
1999 msg = 'File locking is not supported'
2000
2001 def __init__(self):
2002 super().__init__(self.msg)
2003
2004
2005 # Cross-platform file locking
2006 if sys.platform == 'win32':
2007 import ctypes
2008 import ctypes.wintypes
2009 import msvcrt
2010
2011 class OVERLAPPED(ctypes.Structure):
2012 _fields_ = [
2013 ('Internal', ctypes.wintypes.LPVOID),
2014 ('InternalHigh', ctypes.wintypes.LPVOID),
2015 ('Offset', ctypes.wintypes.DWORD),
2016 ('OffsetHigh', ctypes.wintypes.DWORD),
2017 ('hEvent', ctypes.wintypes.HANDLE),
2018 ]
2019
2020 kernel32 = ctypes.windll.kernel32
2021 LockFileEx = kernel32.LockFileEx
2022 LockFileEx.argtypes = [
2023 ctypes.wintypes.HANDLE, # hFile
2024 ctypes.wintypes.DWORD, # dwFlags
2025 ctypes.wintypes.DWORD, # dwReserved
2026 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2027 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2028 ctypes.POINTER(OVERLAPPED) # Overlapped
2029 ]
2030 LockFileEx.restype = ctypes.wintypes.BOOL
2031 UnlockFileEx = kernel32.UnlockFileEx
2032 UnlockFileEx.argtypes = [
2033 ctypes.wintypes.HANDLE, # hFile
2034 ctypes.wintypes.DWORD, # dwReserved
2035 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2036 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2037 ctypes.POINTER(OVERLAPPED) # Overlapped
2038 ]
2039 UnlockFileEx.restype = ctypes.wintypes.BOOL
2040 whole_low = 0xffffffff
2041 whole_high = 0x7fffffff
2042
2043 def _lock_file(f, exclusive, block):
2044 overlapped = OVERLAPPED()
2045 overlapped.Offset = 0
2046 overlapped.OffsetHigh = 0
2047 overlapped.hEvent = 0
2048 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2049
2050 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2051 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2052 0, whole_low, whole_high, f._lock_file_overlapped_p):
2053 # NB: The no-argument form of "ctypes.FormatError" does not work on PyPy
2054 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2055
2056 def _unlock_file(f):
2057 assert f._lock_file_overlapped_p
2058 handle = msvcrt.get_osfhandle(f.fileno())
2059 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2060 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2061
2062 else:
2063 try:
2064 import fcntl
2065
2066 def _lock_file(f, exclusive, block):
2067 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2068 if not block:
2069 flags |= fcntl.LOCK_NB
2070 try:
2071 fcntl.flock(f, flags)
2072 except BlockingIOError:
2073 raise
2074 except OSError: # AOSP does not have flock()
2075 fcntl.lockf(f, flags)
2076
2077 def _unlock_file(f):
2078 try:
2079 fcntl.flock(f, fcntl.LOCK_UN)
2080 except OSError:
2081 fcntl.lockf(f, fcntl.LOCK_UN)
2082
2083 except ImportError:
2084
2085 def _lock_file(f, exclusive, block):
2086 raise LockingUnsupportedError()
2087
2088 def _unlock_file(f):
2089 raise LockingUnsupportedError()
2090
2091
2092 class locked_file:
2093 locked = False
2094
2095 def __init__(self, filename, mode, block=True, encoding=None):
2096 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2097 raise NotImplementedError(mode)
2098 self.mode, self.block = mode, block
2099
2100 writable = any(f in mode for f in 'wax+')
2101 readable = any(f in mode for f in 'r+')
2102 flags = functools.reduce(operator.ior, (
2103 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2104 getattr(os, 'O_BINARY', 0), # Windows only
2105 getattr(os, 'O_NOINHERIT', 0), # Windows only
2106 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2107 os.O_APPEND if 'a' in mode else 0,
2108 os.O_EXCL if 'x' in mode else 0,
2109 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2110 ))
2111
2112 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2113
2114 def __enter__(self):
2115 exclusive = 'r' not in self.mode
2116 try:
2117 _lock_file(self.f, exclusive, self.block)
2118 self.locked = True
2119 except OSError:
2120 self.f.close()
2121 raise
2122 if 'w' in self.mode:
2123 try:
2124 self.f.truncate()
2125 except OSError as e:
2126 if e.errno not in (
2127 errno.ESPIPE, # Illegal seek - expected for FIFO
2128 errno.EINVAL, # Invalid argument - expected for /dev/null
2129 ):
2130 raise
2131 return self
2132
2133 def unlock(self):
2134 if not self.locked:
2135 return
2136 try:
2137 _unlock_file(self.f)
2138 finally:
2139 self.locked = False
2140
2141 def __exit__(self, *_):
2142 try:
2143 self.unlock()
2144 finally:
2145 self.f.close()
2146
2147 open = __enter__
2148 close = __exit__
2149
2150 def __getattr__(self, attr):
2151 return getattr(self.f, attr)
2152
2153 def __iter__(self):
2154 return iter(self.f)
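# Illustrative usage of locked_file (a sketch; the filename is hypothetical;
# the lock is acquired on __enter__ and released before the file is closed):
#   with locked_file('archive.txt', 'a', block=True) as f:
#       f.write('some entry\n')
# With block=False, a lock held elsewhere raises BlockingIOError instead of waiting.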
2155
2156
2157 @functools.cache
2158 def get_filesystem_encoding():
2159 encoding = sys.getfilesystemencoding()
2160 return encoding if encoding is not None else 'utf-8'
2161
2162
2163 def shell_quote(args):
2164 quoted_args = []
2165 encoding = get_filesystem_encoding()
2166 for a in args:
2167 if isinstance(a, bytes):
2168 # We may get a filename encoded with 'encodeFilename'
2169 a = a.decode(encoding)
2170 quoted_args.append(compat_shlex_quote(a))
2171 return ' '.join(quoted_args)
2172
2173
2174 def smuggle_url(url, data):
2175 """ Pass additional data in a URL for internal use. """
2176
2177 url, idata = unsmuggle_url(url, {})
2178 data.update(idata)
2179 sdata = urllib.parse.urlencode(
2180 {'__youtubedl_smuggle': json.dumps(data)})
2181 return url + '#' + sdata
2182
2183
2184 def unsmuggle_url(smug_url, default=None):
2185 if '#__youtubedl_smuggle' not in smug_url:
2186 return smug_url, default
2187 url, _, sdata = smug_url.rpartition('#')
2188 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2189 data = json.loads(jsond)
2190 return url, data
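# Illustrative round trip (a sketch; the payload travels as JSON in the URL fragment):
#   >>> url = smuggle_url('https://example.com/video', {'referer': 'https://example.com'})
#   >>> unsmuggle_url(url)
#   ('https://example.com/video', {'referer': 'https://example.com'})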
2191
2192
2193 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2194 """ Formats numbers with decimal sufixes like K, M, etc """
2195 num, factor = float_or_none(num), float(factor)
2196 if num is None or num < 0:
2197 return None
2198 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2199 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2200 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2201 if factor == 1024:
2202 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2203 converted = num / (factor ** exponent)
2204 return fmt % (converted, suffix)
2205
2206
2207 def format_bytes(bytes):
2208 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
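# Illustrative values (a sketch; factor=1024 switches to the binary "i" suffixes):
#   >>> format_decimal_suffix(1500)
#   '1k'
#   >>> format_bytes(1536)
#   '1.50KiB'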
2209
2210
2211 def lookup_unit_table(unit_table, s):
2212 units_re = '|'.join(re.escape(u) for u in unit_table)
2213 m = re.match(
2214 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2215 if not m:
2216 return None
2217 num_str = m.group('num').replace(',', '.')
2218 mult = unit_table[m.group('unit')]
2219 return int(float(num_str) * mult)
2220
2221
2222 def parse_filesize(s):
2223 if s is None:
2224 return None
2225
2226 # The lower-case forms are of course incorrect and unofficial,
2227 # but we support those too
2228 _UNIT_TABLE = {
2229 'B': 1,
2230 'b': 1,
2231 'bytes': 1,
2232 'KiB': 1024,
2233 'KB': 1000,
2234 'kB': 1024,
2235 'Kb': 1000,
2236 'kb': 1000,
2237 'kilobytes': 1000,
2238 'kibibytes': 1024,
2239 'MiB': 1024 ** 2,
2240 'MB': 1000 ** 2,
2241 'mB': 1024 ** 2,
2242 'Mb': 1000 ** 2,
2243 'mb': 1000 ** 2,
2244 'megabytes': 1000 ** 2,
2245 'mebibytes': 1024 ** 2,
2246 'GiB': 1024 ** 3,
2247 'GB': 1000 ** 3,
2248 'gB': 1024 ** 3,
2249 'Gb': 1000 ** 3,
2250 'gb': 1000 ** 3,
2251 'gigabytes': 1000 ** 3,
2252 'gibibytes': 1024 ** 3,
2253 'TiB': 1024 ** 4,
2254 'TB': 1000 ** 4,
2255 'tB': 1024 ** 4,
2256 'Tb': 1000 ** 4,
2257 'tb': 1000 ** 4,
2258 'terabytes': 1000 ** 4,
2259 'tebibytes': 1024 ** 4,
2260 'PiB': 1024 ** 5,
2261 'PB': 1000 ** 5,
2262 'pB': 1024 ** 5,
2263 'Pb': 1000 ** 5,
2264 'pb': 1000 ** 5,
2265 'petabytes': 1000 ** 5,
2266 'pebibytes': 1024 ** 5,
2267 'EiB': 1024 ** 6,
2268 'EB': 1000 ** 6,
2269 'eB': 1024 ** 6,
2270 'Eb': 1000 ** 6,
2271 'eb': 1000 ** 6,
2272 'exabytes': 1000 ** 6,
2273 'exbibytes': 1024 ** 6,
2274 'ZiB': 1024 ** 7,
2275 'ZB': 1000 ** 7,
2276 'zB': 1024 ** 7,
2277 'Zb': 1000 ** 7,
2278 'zb': 1000 ** 7,
2279 'zettabytes': 1000 ** 7,
2280 'zebibytes': 1024 ** 7,
2281 'YiB': 1024 ** 8,
2282 'YB': 1000 ** 8,
2283 'yB': 1024 ** 8,
2284 'Yb': 1000 ** 8,
2285 'yb': 1000 ** 8,
2286 'yottabytes': 1000 ** 8,
2287 'yobibytes': 1024 ** 8,
2288 }
2289
2290 return lookup_unit_table(_UNIT_TABLE, s)
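# Illustrative values (a sketch; note the table's quirk that lower-case 'kB'
# is treated as 1024 bytes while 'KB' is 1000):
#   >>> parse_filesize('1.5 MiB')
#   1572864
#   >>> parse_filesize('10 MB')
#   10000000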
2291
2292
2293 def parse_count(s):
2294 if s is None:
2295 return None
2296
2297 s = re.sub(r'^[^\d]+\s', '', s).strip()
2298
2299 if re.match(r'^[\d,.]+$', s):
2300 return str_to_int(s)
2301
2302 _UNIT_TABLE = {
2303 'k': 1000,
2304 'K': 1000,
2305 'm': 1000 ** 2,
2306 'M': 1000 ** 2,
2307 'kk': 1000 ** 2,
2308 'KK': 1000 ** 2,
2309 'b': 1000 ** 3,
2310 'B': 1000 ** 3,
2311 }
2312
2313 ret = lookup_unit_table(_UNIT_TABLE, s)
2314 if ret is not None:
2315 return ret
2316
2317 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2318 if mobj:
2319 return str_to_int(mobj.group(1))
2320
2321
2322 def parse_resolution(s, *, lenient=False):
2323 if s is None:
2324 return {}
2325
2326 if lenient:
2327 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2328 else:
2329 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2330 if mobj:
2331 return {
2332 'width': int(mobj.group('w')),
2333 'height': int(mobj.group('h')),
2334 }
2335
2336 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2337 if mobj:
2338 return {'height': int(mobj.group(1))}
2339
2340 mobj = re.search(r'\b([48])[kK]\b', s)
2341 if mobj:
2342 return {'height': int(mobj.group(1)) * 540}
2343
2344 return {}
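# Illustrative values (a sketch; '4k'/'8k' are approximated as 16:9 heights):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4k')
#   {'height': 2160}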
2345
2346
2347 def parse_bitrate(s):
2348 if not isinstance(s, str):
2349 return
2350 mobj = re.search(r'\b(\d+)\s*kbps', s)
2351 if mobj:
2352 return int(mobj.group(1))
2353
2354
2355 def month_by_name(name, lang='en'):
2356 """ Return the number of a month by (locale-independently) English name """
2357
2358 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2359
2360 try:
2361 return month_names.index(name) + 1
2362 except ValueError:
2363 return None
2364
2365
2366 def month_by_abbreviation(abbrev):
2367 """ Return the number of a month by (locale-independently) English
2368 abbreviations """
2369
2370 try:
2371 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2372 except ValueError:
2373 return None
2374
2375
2376 def fix_xml_ampersands(xml_str):
2377 """Replace all the '&' by '&amp;' in XML"""
2378 return re.sub(
2379 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2380 '&amp;',
2381 xml_str)
2382
2383
2384 def setproctitle(title):
2385 assert isinstance(title, str)
2386
2387 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2388 try:
2389 import ctypes
2390 except ImportError:
2391 return
2392
2393 try:
2394 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2395 except OSError:
2396 return
2397 except TypeError:
2398 # LoadLibrary in Windows Python 2.7.13 only expects
2399 # a bytestring, but since unicode_literals turns
2400 # every string into a unicode string, it fails.
2401 return
2402 title_bytes = title.encode()
2403 buf = ctypes.create_string_buffer(len(title_bytes))
2404 buf.value = title_bytes
2405 try:
2406 libc.prctl(15, buf, 0, 0, 0)
2407 except AttributeError:
2408 return # Strange libc, just skip this
2409
2410
2411 def remove_start(s, start):
2412 return s[len(start):] if s is not None and s.startswith(start) else s
2413
2414
2415 def remove_end(s, end):
2416 return s[:-len(end)] if s is not None and s.endswith(end) else s
2417
2418
2419 def remove_quotes(s):
2420 if s is None or len(s) < 2:
2421 return s
2422 for quote in ('"', "'", ):
2423 if s[0] == quote and s[-1] == quote:
2424 return s[1:-1]
2425 return s
2426
2427
2428 def get_domain(url):
2429 """
2430 This implementation is inconsistent, but is kept for compatibility.
2431 Use this only for "webpage_url_domain"
2432 """
2433 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2434
2435
2436 def url_basename(url):
2437 path = urllib.parse.urlparse(url).path
2438 return path.strip('/').split('/')[-1]
2439
2440
2441 def base_url(url):
2442 return re.match(r'https?://[^?#&]+/', url).group()
2443
2444
2445 def urljoin(base, path):
2446 if isinstance(path, bytes):
2447 path = path.decode()
2448 if not isinstance(path, str) or not path:
2449 return None
2450 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2451 return path
2452 if isinstance(base, bytes):
2453 base = base.decode()
2454 if not isinstance(base, str) or not re.match(
2455 r'^(?:https?:)?//', base):
2456 return None
2457 return urllib.parse.urljoin(base, path)
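# Illustrative behaviour (a sketch; invalid inputs yield None rather than raising):
#   >>> urljoin('https://example.com/a/b', 'c')
#   'https://example.com/a/c'
#   >>> urljoin('https://example.com/a/', '//cdn.example.com/x')  # already absolute
#   '//cdn.example.com/x'
#   urljoin(None, 'c')  # -> None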
2458
2459
2460 class HEADRequest(urllib.request.Request):
2461 def get_method(self):
2462 return 'HEAD'
2463
2464
2465 class PUTRequest(urllib.request.Request):
2466 def get_method(self):
2467 return 'PUT'
2468
2469
2470 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2471 if get_attr and v is not None:
2472 v = getattr(v, get_attr, None)
2473 try:
2474 return int(v) * invscale // scale
2475 except (ValueError, TypeError, OverflowError):
2476 return default
2477
2478
2479 def str_or_none(v, default=None):
2480 return default if v is None else str(v)
2481
2482
2483 def str_to_int(int_str):
2484 """ A more relaxed version of int_or_none """
2485 if isinstance(int_str, int):
2486 return int_str
2487 elif isinstance(int_str, str):
2488 int_str = re.sub(r'[,\.\+]', '', int_str)
2489 return int_or_none(int_str)
2490
2491
2492 def float_or_none(v, scale=1, invscale=1, default=None):
2493 if v is None:
2494 return default
2495 try:
2496 return float(v) * invscale / scale
2497 except (ValueError, TypeError):
2498 return default
2499
2500
2501 def bool_or_none(v, default=None):
2502 return v if isinstance(v, bool) else default
2503
2504
2505 def strip_or_none(v, default=None):
2506 return v.strip() if isinstance(v, str) else default
2507
2508
2509 def url_or_none(url):
2510 if not url or not isinstance(url, str):
2511 return None
2512 url = url.strip()
2513 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2514
2515
2516 def request_to_url(req):
2517 if isinstance(req, urllib.request.Request):
2518 return req.get_full_url()
2519 else:
2520 return req
2521
2522
2523 def strftime_or_none(timestamp, date_format, default=None):
2524 datetime_object = None
2525 try:
2526 if isinstance(timestamp, (int, float)): # unix timestamp
2527 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2528 elif isinstance(timestamp, str): # assume YYYYMMDD
2529 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2530 return datetime_object.strftime(date_format)
2531 except (ValueError, TypeError, AttributeError):
2532 return default
2533
2534
2535 def parse_duration(s):
2536 if not isinstance(s, str):
2537 return None
2538 s = s.strip()
2539 if not s:
2540 return None
2541
2542 days, hours, mins, secs, ms = [None] * 5
2543 m = re.match(r'''(?x)
2544 (?P<before_secs>
2545 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2546 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2547 (?P<ms>[.:][0-9]+)?Z?$
2548 ''', s)
2549 if m:
2550 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2551 else:
2552 m = re.match(
2553 r'''(?ix)(?:P?
2554 (?:
2555 [0-9]+\s*y(?:ears?)?,?\s*
2556 )?
2557 (?:
2558 [0-9]+\s*m(?:onths?)?,?\s*
2559 )?
2560 (?:
2561 [0-9]+\s*w(?:eeks?)?,?\s*
2562 )?
2563 (?:
2564 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2565 )?
2566 T)?
2567 (?:
2568 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2569 )?
2570 (?:
2571 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2572 )?
2573 (?:
2574 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2575 )?Z?$''', s)
2576 if m:
2577 days, hours, mins, secs, ms = m.groups()
2578 else:
2579 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2580 if m:
2581 hours, mins = m.groups()
2582 else:
2583 return None
2584
2585 if ms:
2586 ms = ms.replace(':', '.')
2587 return sum(float(part or 0) * mult for part, mult in (
2588 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
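# Illustrative values (a sketch covering the three regex branches above):
#   >>> parse_duration('1:23:45')   # HH:MM:SS
#   5025.0
#   >>> parse_duration('PT1H30M')   # ISO 8601
#   5400.0
#   >>> parse_duration('3 min')
#   180.0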
2589
2590
2591 def prepend_extension(filename, ext, expected_real_ext=None):
2592 name, real_ext = os.path.splitext(filename)
2593 return (
2594 f'{name}.{ext}{real_ext}'
2595 if not expected_real_ext or real_ext[1:] == expected_real_ext
2596 else f'{filename}.{ext}')
2597
2598
2599 def replace_extension(filename, ext, expected_real_ext=None):
2600 name, real_ext = os.path.splitext(filename)
2601 return '{}.{}'.format(
2602 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2603 ext)
2604
2605
2606 def check_executable(exe, args=[]):
2607 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2608 args can be a list of arguments for a short output (like -version) """
2609 try:
2610 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2611 except OSError:
2612 return False
2613 return exe
2614
2615
2616 def _get_exe_version_output(exe, args, *, to_screen=None):
2617 if to_screen:
2618 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2619 try:
2620 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2621 # SIGTTOU if yt-dlp is run in the background.
2622 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2623 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2624 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2625 except OSError:
2626 return False
2627 return stdout
2628
2629
2630 def detect_exe_version(output, version_re=None, unrecognized='present'):
2631 assert isinstance(output, str)
2632 if version_re is None:
2633 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2634 m = re.search(version_re, output)
2635 if m:
2636 return m.group(1)
2637 else:
2638 return unrecognized
2639
2640
2641 def get_exe_version(exe, args=['--version'],
2642 version_re=None, unrecognized='present'):
2643 """ Returns the version of the specified executable,
2644 or False if the executable is not present """
2645 out = _get_exe_version_output(exe, args)
2646 return detect_exe_version(out, version_re, unrecognized) if out else False
2647
2648
2649 def frange(start=0, stop=None, step=1):
2650 """Float range"""
2651 if stop is None:
2652 start, stop = 0, start
2653 sign = [-1, 1][step > 0] if step else 0
2654 while sign * start < sign * stop:
2655 yield start
2656 start += step
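# Illustrative usage (a sketch; like range() but accepting float arguments):
#   >>> list(frange(3))
#   [0, 1, 2]
#   >>> list(frange(0, 1, 0.25))
#   [0, 0.25, 0.5, 0.75]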
2657
2658
2659 class LazyList(collections.abc.Sequence):
2660 """Lazy immutable list from an iterable
2661 Note that slices of a LazyList are lists, not LazyList"""
2662
2663 class IndexError(IndexError):
2664 pass
2665
2666 def __init__(self, iterable, *, reverse=False, _cache=None):
2667 self._iterable = iter(iterable)
2668 self._cache = [] if _cache is None else _cache
2669 self._reversed = reverse
2670
2671 def __iter__(self):
2672 if self._reversed:
2673 # We need to consume the entire iterable to iterate in reverse
2674 yield from self.exhaust()
2675 return
2676 yield from self._cache
2677 for item in self._iterable:
2678 self._cache.append(item)
2679 yield item
2680
2681 def _exhaust(self):
2682 self._cache.extend(self._iterable)
2683 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2684 return self._cache
2685
2686 def exhaust(self):
2687 """Evaluate the entire iterable"""
2688 return self._exhaust()[::-1 if self._reversed else 1]
2689
2690 @staticmethod
2691 def _reverse_index(x):
2692 return None if x is None else ~x
2693
2694 def __getitem__(self, idx):
2695 if isinstance(idx, slice):
2696 if self._reversed:
2697 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2698 start, stop, step = idx.start, idx.stop, idx.step or 1
2699 elif isinstance(idx, int):
2700 if self._reversed:
2701 idx = self._reverse_index(idx)
2702 start, stop, step = idx, idx, 0
2703 else:
2704 raise TypeError('indices must be integers or slices')
2705 if ((start or 0) < 0 or (stop or 0) < 0
2706 or (start is None and step < 0)
2707 or (stop is None and step > 0)):
2708 # We need to consume the entire iterable to be able to slice from the end
2709 # Obviously, never use this with infinite iterables
2710 self._exhaust()
2711 try:
2712 return self._cache[idx]
2713 except IndexError as e:
2714 raise self.IndexError(e) from e
2715 n = max(start or 0, stop or 0) - len(self._cache) + 1
2716 if n > 0:
2717 self._cache.extend(itertools.islice(self._iterable, n))
2718 try:
2719 return self._cache[idx]
2720 except IndexError as e:
2721 raise self.IndexError(e) from e
2722
2723 def __bool__(self):
2724 try:
2725 self[-1] if self._reversed else self[0]
2726 except self.IndexError:
2727 return False
2728 return True
2729
2730 def __len__(self):
2731 self._exhaust()
2732 return len(self._cache)
2733
2734 def __reversed__(self):
2735 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2736
2737 def __copy__(self):
2738 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2739
2740 def __repr__(self):
2741 # repr and str should mimic a list. So we exhaust the iterable
2742 return repr(self.exhaust())
2743
2744 def __str__(self):
2745 return repr(self.exhaust())
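# Illustrative usage of LazyList (a sketch; items are pulled from the iterable
# only as far as an access requires):
#   >>> ll = LazyList(itertools.count())  # an infinite iterable is fine...
#   >>> ll[3]                             # consumes just the first 4 items
#   3
# ...as long as nothing forces exhaustion (len(), negative indices, repr()).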
2746
2747
2748 class PagedList:
2749
2750 class IndexError(IndexError):
2751 pass
2752
2753 def __len__(self):
2754 # This is only useful for tests
2755 return len(self.getslice())
2756
2757 def __init__(self, pagefunc, pagesize, use_cache=True):
2758 self._pagefunc = pagefunc
2759 self._pagesize = pagesize
2760 self._pagecount = float('inf')
2761 self._use_cache = use_cache
2762 self._cache = {}
2763
2764 def getpage(self, pagenum):
2765 page_results = self._cache.get(pagenum)
2766 if page_results is None:
2767 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2768 if self._use_cache:
2769 self._cache[pagenum] = page_results
2770 return page_results
2771
2772 def getslice(self, start=0, end=None):
2773 return list(self._getslice(start, end))
2774
2775 def _getslice(self, start, end):
2776 raise NotImplementedError('This method must be implemented by subclasses')
2777
2778 def __getitem__(self, idx):
2779 assert self._use_cache, 'Indexing PagedList requires cache'
2780 if not isinstance(idx, int) or idx < 0:
2781 raise TypeError('indices must be non-negative integers')
2782 entries = self.getslice(idx, idx + 1)
2783 if not entries:
2784 raise self.IndexError()
2785 return entries[0]
2786
2787
2788 class OnDemandPagedList(PagedList):
2789 """Download pages until a page with less than maximum results"""
2790
2791 def _getslice(self, start, end):
2792 for pagenum in itertools.count(start // self._pagesize):
2793 firstid = pagenum * self._pagesize
2794 nextfirstid = pagenum * self._pagesize + self._pagesize
2795 if start >= nextfirstid:
2796 continue
2797
2798 startv = (
2799 start % self._pagesize
2800 if firstid <= start < nextfirstid
2801 else 0)
2802 endv = (
2803 ((end - 1) % self._pagesize) + 1
2804 if (end is not None and firstid <= end <= nextfirstid)
2805 else None)
2806
2807 try:
2808 page_results = self.getpage(pagenum)
2809 except Exception:
2810 self._pagecount = pagenum - 1
2811 raise
2812 if startv != 0 or endv is not None:
2813 page_results = page_results[startv:endv]
2814 yield from page_results
2815
2816 # A little optimization - if the current page is not "full", i.e. does
2817 # not contain page_size videos, then we can assume that this page
2818 # is the last one - there are no more ids on further pages,
2819 # so there is no need to query again.
2820 if len(page_results) + startv < self._pagesize:
2821 break
2822
2823 # If we got the whole page, but the next page is not interesting,
2824 # break out early as well
2825 if end == nextfirstid:
2826 break
2827
2828
2829 class InAdvancePagedList(PagedList):
2830 """PagedList with total number of pages known in advance"""
2831
2832 def __init__(self, pagefunc, pagecount, pagesize):
2833 PagedList.__init__(self, pagefunc, pagesize, True)
2834 self._pagecount = pagecount
2835
2836 def _getslice(self, start, end):
2837 start_page = start // self._pagesize
2838 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2839 skip_elems = start - start_page * self._pagesize
2840 only_more = None if end is None else end - start
2841 for pagenum in range(start_page, end_page):
2842 page_results = self.getpage(pagenum)
2843 if skip_elems:
2844 page_results = page_results[skip_elems:]
2845 skip_elems = None
2846 if only_more is not None:
2847 if len(page_results) < only_more:
2848 only_more -= len(page_results)
2849 else:
2850 yield from page_results[:only_more]
2851 break
2852 yield from page_results
2853
2854
2855 class PlaylistEntries:
2856 MissingEntry = object()
2857 is_exhausted = False
2858
2859 def __init__(self, ydl, info_dict):
2860 self.ydl = ydl
2861
2862 # _entries must be assigned now since the info_dict can change during iteration
2863 entries = info_dict.get('entries')
2864 if entries is None:
2865 raise EntryNotInPlaylist('There are no entries')
2866 elif isinstance(entries, list):
2867 self.is_exhausted = True
2868
2869 requested_entries = info_dict.get('requested_entries')
2870 self.is_incomplete = bool(requested_entries)
2871 if self.is_incomplete:
2872 assert self.is_exhausted
2873 self._entries = [self.MissingEntry] * max(requested_entries)
2874 for i, entry in zip(requested_entries, entries):
2875 self._entries[i - 1] = entry
2876 elif isinstance(entries, (list, PagedList, LazyList)):
2877 self._entries = entries
2878 else:
2879 self._entries = LazyList(entries)
2880
2881 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2882 (?P<start>[+-]?\d+)?
2883 (?P<range>[:-]
2884 (?P<end>[+-]?\d+|inf(?:inite)?)?
2885 (?::(?P<step>[+-]?\d+))?
2886 )?''')
2887
2888 @classmethod
2889 def parse_playlist_items(cls, string):
2890 for segment in string.split(','):
2891 if not segment:
2892 raise ValueError('There are two or more consecutive commas')
2893 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2894 if not mobj:
2895 raise ValueError(f'{segment!r} is not a valid specification')
2896 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2897 if int_or_none(step) == 0:
2898 raise ValueError(f'Step in {segment!r} cannot be zero')
2899 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
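# Illustrative parse (a sketch; single items become ints, ranges become slices,
# with a float stop so that 'inf' can be represented):
#   >>> list(PlaylistEntries.parse_playlist_items('1,3,5:8'))
#   [1, 3, slice(5, 8.0, None)]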
2900
2901 def get_requested_items(self):
2902 playlist_items = self.ydl.params.get('playlist_items')
2903 playlist_start = self.ydl.params.get('playliststart', 1)
2904 playlist_end = self.ydl.params.get('playlistend')
2905 # For backwards compatibility, interpret -1 as whole list
2906 if playlist_end in (-1, None):
2907 playlist_end = ''
2908 if not playlist_items:
2909 playlist_items = f'{playlist_start}:{playlist_end}'
2910 elif playlist_start != 1 or playlist_end:
2911 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2912
2913 for index in self.parse_playlist_items(playlist_items):
2914 for i, entry in self[index]:
2915 yield i, entry
2916 if not entry:
2917 continue
2918 try:
2919 # TODO: Add auto-generated fields
2920 self.ydl._match_entry(entry, incomplete=True, silent=True)
2921 except (ExistingVideoReached, RejectedVideoReached):
2922 return
2923
2924 def get_full_count(self):
2925 if self.is_exhausted and not self.is_incomplete:
2926 return len(self)
2927 elif isinstance(self._entries, InAdvancePagedList):
2928 if self._entries._pagesize == 1:
2929 return self._entries._pagecount
2930
2931 @functools.cached_property
2932 def _getter(self):
2933 if isinstance(self._entries, list):
2934 def get_entry(i):
2935 try:
2936 entry = self._entries[i]
2937 except IndexError:
2938 entry = self.MissingEntry
2939 if not self.is_incomplete:
2940 raise self.IndexError()
2941 if entry is self.MissingEntry:
2942 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2943 return entry
2944 else:
2945 def get_entry(i):
2946 try:
2947 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2948 except (LazyList.IndexError, PagedList.IndexError):
2949 raise self.IndexError()
2950 return get_entry
2951
2952 def __getitem__(self, idx):
2953 if isinstance(idx, int):
2954 idx = slice(idx, idx)
2955
2956 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2957 step = 1 if idx.step is None else idx.step
2958 if idx.start is None:
2959 start = 0 if step > 0 else len(self) - 1
2960 else:
2961 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2962
2963 # NB: Do not call len(self) when idx == [:]
2964 if idx.stop is None:
2965 stop = 0 if step < 0 else float('inf')
2966 else:
2967 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2968 stop += [-1, 1][step > 0]
2969
2970 for i in frange(start, stop, step):
2971 if i < 0:
2972 continue
2973 try:
2974 entry = self._getter(i)
2975 except self.IndexError:
2976 self.is_exhausted = True
2977 if step > 0:
2978 break
2979 continue
2980 yield i + 1, entry
2981
2982 def __len__(self):
2983 return len(tuple(self[:]))
2984
2985 class IndexError(IndexError):
2986 pass
2987
2988
2989 def uppercase_escape(s):
2990 unicode_escape = codecs.getdecoder('unicode_escape')
2991 return re.sub(
2992 r'\\U[0-9a-fA-F]{8}',
2993 lambda m: unicode_escape(m.group(0))[0],
2994 s)
2995
2996
2997 def lowercase_escape(s):
2998 unicode_escape = codecs.getdecoder('unicode_escape')
2999 return re.sub(
3000 r'\\u[0-9a-fA-F]{4}',
3001 lambda m: unicode_escape(m.group(0))[0],
3002 s)
3003
3004
3005 def escape_rfc3986(s):
3006 """Escape non-ASCII characters as suggested by RFC 3986"""
3007 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3008
3009
3010 def escape_url(url):
3011 """Escape URL as suggested by RFC 3986"""
3012 url_parsed = urllib.parse.urlparse(url)
3013 return url_parsed._replace(
3014 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3015 path=escape_rfc3986(url_parsed.path),
3016 params=escape_rfc3986(url_parsed.params),
3017 query=escape_rfc3986(url_parsed.query),
3018 fragment=escape_rfc3986(url_parsed.fragment)
3019 ).geturl()
3020
3021
3022 def parse_qs(url):
3023 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
3024
3025
3026 def read_batch_urls(batch_fd):
3027 def fixup(url):
3028 if not isinstance(url, str):
3029 url = url.decode('utf-8', 'replace')
3030 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3031 for bom in BOM_UTF8:
3032 if url.startswith(bom):
3033 url = url[len(bom):]
3034 url = url.lstrip()
3035 if not url or url.startswith(('#', ';', ']')):
3036 return False
3037 # "#" cannot be stripped out since it is part of the URI
3038 # However, it can be safely stripped out if it follows a whitespace
3039 return re.split(r'\s#', url, 1)[0].rstrip()
3040
3041 with contextlib.closing(batch_fd) as fd:
3042 return [url for url in map(fixup, fd) if url]
3043
3044
3045 def urlencode_postdata(*args, **kargs):
3046 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3047
3048
3049 def update_url_query(url, query):
3050 if not query:
3051 return url
3052 parsed_url = urllib.parse.urlparse(url)
3053 qs = urllib.parse.parse_qs(parsed_url.query)
3054 qs.update(query)
3055 return urllib.parse.urlunparse(parsed_url._replace(
3056 query=urllib.parse.urlencode(qs, True)))
3057
3058
3059 def update_Request(req, url=None, data=None, headers=None, query=None):
3060 req_headers = req.headers.copy()
3061 req_headers.update(headers or {})
3062 req_data = data or req.data
3063 req_url = update_url_query(url or req.get_full_url(), query)
3064 req_get_method = req.get_method()
3065 if req_get_method == 'HEAD':
3066 req_type = HEADRequest
3067 elif req_get_method == 'PUT':
3068 req_type = PUTRequest
3069 else:
3070 req_type = urllib.request.Request
3071 new_req = req_type(
3072 req_url, data=req_data, headers=req_headers,
3073 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3074 if hasattr(req, 'timeout'):
3075 new_req.timeout = req.timeout
3076 return new_req
3077
3078
3079 def _multipart_encode_impl(data, boundary):
3080 content_type = 'multipart/form-data; boundary=%s' % boundary
3081
3082 out = b''
3083 for k, v in data.items():
3084 out += b'--' + boundary.encode('ascii') + b'\r\n'
3085 if isinstance(k, str):
3086 k = k.encode()
3087 if isinstance(v, str):
3088 v = v.encode()
3089 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3090 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3091 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3092 if boundary.encode('ascii') in content:
3093 raise ValueError('Boundary overlaps with data')
3094 out += content
3095
3096 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3097
3098 return out, content_type
3099
3100
3101 def multipart_encode(data, boundary=None):
3102 '''
3103 Encode a dict to RFC 7578-compliant form-data
3104
3105 data:
3106 A dict where keys and values can be either Unicode or bytes-like
3107 objects.
3108 boundary:
3109 If specified, the given Unicode object is used as the boundary.
3110 Otherwise a random boundary is generated.
3111
3112 Reference: https://tools.ietf.org/html/rfc7578
3113 '''
3114 has_specified_boundary = boundary is not None
3115
3116 while True:
3117 if boundary is None:
3118 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3119
3120 try:
3121 out, content_type = _multipart_encode_impl(data, boundary)
3122 break
3123 except ValueError:
3124 if has_specified_boundary:
3125 raise
3126 boundary = None
3127
3128 return out, content_type
3129
3130
3131 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3132 for val in map(d.get, variadic(key_or_keys)):
3133 if val is not None and (val or not skip_false_values):
3134 return val
3135 return default
3136
3137
3138 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3139 for f in funcs:
3140 try:
3141 val = f(*args, **kwargs)
3142 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
3143 pass
3144 else:
3145 if expected_type is None or isinstance(val, expected_type):
3146 return val
3147
3148
3149 def try_get(src, getter, expected_type=None):
3150 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3151
3152
3153 def filter_dict(dct, cndn=lambda _, v: v is not None):
3154 return {k: v for k, v in dct.items() if cndn(k, v)}
3155
3156
3157 def merge_dicts(*dicts):
3158 merged = {}
3159 for a_dict in dicts:
3160 for k, v in a_dict.items():
3161 if (v is not None and k not in merged
3162 or isinstance(v, str) and merged[k] == ''):
3163 merged[k] = v
3164 return merged
3165
3166
3167 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3168 return string if isinstance(string, str) else str(string, encoding, errors)
3169
3170
3171 US_RATINGS = {
3172 'G': 0,
3173 'PG': 10,
3174 'PG-13': 13,
3175 'R': 16,
3176 'NC': 18,
3177 }
3178
3179
3180 TV_PARENTAL_GUIDELINES = {
3181 'TV-Y': 0,
3182 'TV-Y7': 7,
3183 'TV-G': 0,
3184 'TV-PG': 0,
3185 'TV-14': 14,
3186 'TV-MA': 17,
3187 }
3188
3189
3190 def parse_age_limit(s):
3191 # isinstance(False, int) is True. So type() must be used instead
3192 if type(s) is int: # noqa: E721
3193 return s if 0 <= s <= 21 else None
3194 elif not isinstance(s, str):
3195 return None
3196 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3197 if m:
3198 return int(m.group('age'))
3199 s = s.upper()
3200 if s in US_RATINGS:
3201 return US_RATINGS[s]
3202 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3203 if m:
3204 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3205 return None
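# Illustrative values (a sketch; MPAA and TV ratings map onto numeric ages):
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17
#   >>> parse_age_limit('18+')
#   18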
3206
3207
3208 def strip_jsonp(code):
3209 return re.sub(
3210 r'''(?sx)^
3211 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3212 (?:\s*&&\s*(?P=func_name))?
3213 \s*\(\s*(?P<callback_data>.*)\);?
3214 \s*?(?://[^\n]*)*$''',
3215 r'\g<callback_data>', code)
3216
3217
3218 def js_to_json(code, vars={}, *, strict=False):
3219 # vars is a dict of var, val pairs to substitute
3220 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3221 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3222 INTEGER_TABLE = (
3223 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3224 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3225 )
3226
3227 def fix_kv(m):
3228 v = m.group(0)
3229 if v in ('true', 'false', 'null'):
3230 return v
3231 elif v in ('undefined', 'void 0'):
3232 return 'null'
3233 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3234 return ""
3235
3236 if v[0] in ("'", '"'):
3237 v = re.sub(r'(?s)\\.|"', lambda m: {
3238 '"': '\\"',
3239 "\\'": "'",
3240 '\\\n': '',
3241 '\\x': '\\u00',
3242 }.get(m.group(0), m.group(0)), v[1:-1])
3243 else:
3244 for regex, base in INTEGER_TABLE:
3245 im = re.match(regex, v)
3246 if im:
3247 i = int(im.group(1), base)
3248 return '"%d":' % i if v.endswith(':') else '%d' % i
3249
3250 if v in vars:
3251 return vars[v]
3252 if strict:
3253 raise ValueError(f'Unknown value: {v}')
3254
3255 return '"%s"' % v
3256
3257 def create_map(mobj):
3258 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3259
3260 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3261 if not strict:
3262 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3263
3264 return re.sub(r'''(?sx)
3265 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3266 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3267 {comment}|,(?={skip}[\]}}])|
3268 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3269 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3270 [0-9]+(?={skip}:)|
3271 !+
3272 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
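# Illustrative conversion (a sketch; unquoted keys, single-quoted strings and
# hex integers are all normalized to valid JSON):
#   >>> js_to_json("{abc: 'def', n: 0x10}")
#   '{"abc": "def", "n": 16}'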
3273
3274
3275 def qualities(quality_ids):
3276 """ Get a numeric quality value out of a list of possible values """
3277 def q(qid):
3278 try:
3279 return quality_ids.index(qid)
3280 except ValueError:
3281 return -1
3282 return q
3283
3284
3285 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3286
3287
3288 DEFAULT_OUTTMPL = {
3289 'default': '%(title)s [%(id)s].%(ext)s',
3290 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3291 }
3292 OUTTMPL_TYPES = {
3293 'chapter': None,
3294 'subtitle': None,
3295 'thumbnail': None,
3296 'description': 'description',
3297 'annotation': 'annotations.xml',
3298 'infojson': 'info.json',
3299 'link': None,
3300 'pl_video': None,
3301 'pl_thumbnail': None,
3302 'pl_description': 'description',
3303 'pl_infojson': 'info.json',
3304 }
3305
3306 # As of [1], the format syntax is:
3307 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3308 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3309 STR_FORMAT_RE_TMPL = r'''(?x)
3310 (?<!%)(?P<prefix>(?:%%)*)
3311 %
3312 (?P<has_key>\((?P<key>{0})\))?
3313 (?P<format>
3314 (?P<conversion>[#0\-+ ]+)?
3315 (?P<min_width>\d+)?
3316 (?P<precision>\.\d+)?
3317 (?P<len_mod>[hlL])? # unused in python
3318 {1} # conversion type
3319 )
3320 '''
3321
3322
3323 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3324
3325
3326 def limit_length(s, length):
3327 """ Add ellipses to overly long strings """
3328 if s is None:
3329 return None
3330 ELLIPSES = '...'
3331 if len(s) > length:
3332 return s[:length - len(ELLIPSES)] + ELLIPSES
3333 return s
3334
3335
3336 def version_tuple(v):
3337 return tuple(int(e) for e in re.split(r'[-.]', v))
3338
3339
3340 def is_outdated_version(version, limit, assume_new=True):
3341 if not version:
3342 return not assume_new
3343 try:
3344 return version_tuple(version) < version_tuple(limit)
3345 except ValueError:
3346 return not assume_new
3347
3348
3349 def ytdl_is_updateable():
3350 """ Returns if yt-dlp can be updated with -U """
3351
3352 from .update import is_non_updateable
3353
3354 return not is_non_updateable()
3355
3356
3357 def args_to_str(args):
3358 # Get a short string representation for a subprocess command
3359 return ' '.join(compat_shlex_quote(a) for a in args)
3360
3361
3362 def error_to_compat_str(err):
3363 return str(err)
3364
3365
3366 def error_to_str(err):
3367 return f'{type(err).__name__}: {err}'
3368
3369
3370 def mimetype2ext(mt):
3371 if mt is None:
3372 return None
3373
3374 mt, _, params = mt.partition(';')
3375 mt = mt.strip()
3376
3377 FULL_MAP = {
3378 'audio/mp4': 'm4a',
3379 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
3380 # as it's the most popular one
3381 'audio/mpeg': 'mp3',
3382 'audio/x-wav': 'wav',
3383 'audio/wav': 'wav',
3384 'audio/wave': 'wav',
3385 }
3386
3387 ext = FULL_MAP.get(mt)
3388 if ext is not None:
3389 return ext
3390
3391 SUBTYPE_MAP = {
3392 '3gpp': '3gp',
3393 'smptett+xml': 'tt',
3394 'ttaf+xml': 'dfxp',
3395 'ttml+xml': 'ttml',
3396 'x-flv': 'flv',
3397 'x-mp4-fragmented': 'mp4',
3398 'x-ms-sami': 'sami',
3399 'x-ms-wmv': 'wmv',
3400 'mpegurl': 'm3u8',
3401 'x-mpegurl': 'm3u8',
3402 'vnd.apple.mpegurl': 'm3u8',
3403 'dash+xml': 'mpd',
3404 'f4m+xml': 'f4m',
3405 'hds+xml': 'f4m',
3406 'vnd.ms-sstr+xml': 'ism',
3407 'quicktime': 'mov',
3408 'mp2t': 'ts',
3409 'x-wav': 'wav',
3410 'filmstrip+json': 'fs',
3411 'svg+xml': 'svg',
3412 }
3413
3414 _, _, subtype = mt.rpartition('/')
3415 ext = SUBTYPE_MAP.get(subtype.lower())
3416 if ext is not None:
3417 return ext
3418
3419 SUFFIX_MAP = {
3420 'json': 'json',
3421 'xml': 'xml',
3422 'zip': 'zip',
3423 'gzip': 'gz',
3424 }
3425
3426 _, _, suffix = subtype.partition('+')
3427 ext = SUFFIX_MAP.get(suffix)
3428 if ext is not None:
3429 return ext
3430
3431 return subtype.replace('+', '.')
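# Illustrative values (a sketch; parameters are dropped and unknown subtypes
# fall through to the subtype itself):
#   >>> mimetype2ext('audio/x-wav')
#   'wav'
#   >>> mimetype2ext('application/x-mpegurl')
#   'm3u8'
#   >>> mimetype2ext('text/vtt; charset=utf-8')
#   'vtt'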
3432
3433
3434 def ext2mimetype(ext_or_url):
3435 if not ext_or_url:
3436 return None
3437 if '.' not in ext_or_url:
3438 ext_or_url = f'file.{ext_or_url}'
3439 return mimetypes.guess_type(ext_or_url)[0]
3440
3441
3442 def parse_codecs(codecs_str):
3443 # http://tools.ietf.org/html/rfc6381
3444 if not codecs_str:
3445 return {}
3446 split_codecs = list(filter(None, map(
3447 str.strip, codecs_str.strip().strip(',').split(','))))
3448 vcodec, acodec, scodec, hdr = None, None, None, None
3449 for full_codec in split_codecs:
3450 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3451 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3452 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3453 if vcodec:
3454 continue
3455 vcodec = full_codec
3456 if parts[0] in ('dvh1', 'dvhe'):
3457 hdr = 'DV'
3458 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3459 hdr = 'HDR10'
3460 elif parts[:2] == ['vp9', '2']:
3461 hdr = 'HDR10'
3462 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3463 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3464 acodec = acodec or full_codec
3465 elif parts[0] in ('stpp', 'wvtt'):
3466 scodec = scodec or full_codec
3467 else:
3468 write_string(f'WARNING: Unknown codec {full_codec}\n')
3469 if vcodec or acodec or scodec:
3470 return {
3471 'vcodec': vcodec or 'none',
3472 'acodec': acodec or 'none',
3473 'dynamic_range': hdr,
3474 **({'scodec': scodec} if scodec is not None else {}),
3475 }
3476 elif len(split_codecs) == 2:
3477 return {
3478 'vcodec': split_codecs[0],
3479 'acodec': split_codecs[1],
3480 }
3481 return {}
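# Illustrative parse (a sketch; the first recognized video and audio codecs win):
#   >>> parse_codecs('avc1.77.30, mp4a.40.2')
#   {'vcodec': 'avc1.77.30', 'acodec': 'mp4a.40.2', 'dynamic_range': None}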
3482
3483
3484 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3485 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3486
3487 allow_mkv = not preferences or 'mkv' in preferences
3488
3489 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3490 return 'mkv' # TODO: does any other format allow this?
3491
3492 # TODO: Not all codecs supported by parse_codecs are handled here
3493 COMPATIBLE_CODECS = {
3494 'mp4': {
3495 'av1', 'hevc', 'avc1', 'mp4a', # fourcc (m3u8, mpd)
3496 'h264', 'aacl', # Set in ISM
3497 },
3498 'webm': {
3499 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3500 'vp9x', 'vp8x', # in the webm spec
3501 },
3502 }
3503
3504 sanitize_codec = functools.partial(try_get, getter=lambda x: x.split('.')[0].replace('0', ''))
3505 vcodec, acodec = sanitize_codec(vcodecs[0]), sanitize_codec(acodecs[0])
3506
3507 for ext in preferences or COMPATIBLE_CODECS.keys():
3508 codec_set = COMPATIBLE_CODECS.get(ext, set())
3509 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3510 return ext
3511
3512 COMPATIBLE_EXTS = (
3513 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3514 {'webm'},
3515 )
3516 for ext in preferences or vexts:
3517 current_exts = {ext, *vexts, *aexts}
3518 if ext == 'mkv' or current_exts == {ext} or any(
3519 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3520 return ext
3521 return 'mkv' if allow_mkv else preferences[-1]
3522
3523
3524 def urlhandle_detect_ext(url_handle):
3525 getheader = url_handle.headers.get
3526
3527 cd = getheader('Content-Disposition')
3528 if cd:
3529 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3530 if m:
3531 e = determine_ext(m.group('filename'), default_ext=None)
3532 if e:
3533 return e
3534
3535 return mimetype2ext(getheader('Content-Type'))
3536
3537
3538 def encode_data_uri(data, mime_type):
3539 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3540
3541
3542 def age_restricted(content_limit, age_limit):
3543 """ Returns True iff the content should be blocked """
3544
3545 if age_limit is None: # No limit set
3546 return False
3547 if content_limit is None:
3548 return False # Content available for everyone
3549 return age_limit < content_limit
3550
3551
3552 # List of known byte-order-marks (BOM)
3553 BOMS = [
3554 (b'\xef\xbb\xbf', 'utf-8'),
3555 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3556 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3557 (b'\xff\xfe', 'utf-16-le'),
3558 (b'\xfe\xff', 'utf-16-be'),
3559 ]
3560
3561
3562 def is_html(first_bytes):
3563 """ Detect whether a file contains HTML by examining its first bytes. """
3564
3565 encoding = 'utf-8'
3566 for bom, enc in BOMS:
3567 while first_bytes.startswith(bom):
3568 encoding, first_bytes = enc, first_bytes[len(bom):]
3569
3570 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3571
3572
3573 def determine_protocol(info_dict):
3574 protocol = info_dict.get('protocol')
3575 if protocol is not None:
3576 return protocol
3577
3578 url = sanitize_url(info_dict['url'])
3579 if url.startswith('rtmp'):
3580 return 'rtmp'
3581 elif url.startswith('mms'):
3582 return 'mms'
3583 elif url.startswith('rtsp'):
3584 return 'rtsp'
3585
3586 ext = determine_ext(url)
3587 if ext == 'm3u8':
3588 return 'm3u8'
3589 elif ext == 'f4m':
3590 return 'f4m'
3591
3592 return urllib.parse.urlparse(url).scheme
3593
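# Illustrative examples (hypothetical URLs):
#   >>> determine_protocol({'url': 'rtmp://example.com/live'})
#   'rtmp'
#   >>> determine_protocol({'url': 'https://example.com/master.m3u8'})
#   'm3u8'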
3594
3595 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3596 """ Render a list of rows, each as a list of values.
3597 Text after a \t will be right aligned """
3598 def width(string):
3599 return len(remove_terminal_sequences(string).replace('\t', ''))
3600
3601 def get_max_lens(table):
3602 return [max(width(str(v)) for v in col) for col in zip(*table)]
3603
3604 def filter_using_list(row, filterArray):
3605 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3606
3607 max_lens = get_max_lens(data) if hide_empty else []
3608 header_row = filter_using_list(header_row, max_lens)
3609 data = [filter_using_list(row, max_lens) for row in data]
3610
3611 table = [header_row] + data
3612 max_lens = get_max_lens(table)
3613 extra_gap += 1
3614 if delim:
3615 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3616 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3617 for row in table:
3618 for pos, text in enumerate(map(str, row)):
3619 if '\t' in text:
3620 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3621 else:
3622 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3623 ret = '\n'.join(''.join(row).rstrip() for row in table)
3624 return ret
3625
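# Illustrative example (hypothetical values):
#   >>> print(render_table(['id', 'note'], [['1', 'ok'], ['42', 'best']]))
#   id note
#   1  ok
#   42 best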
3626
3627 def _match_one(filter_part, dct, incomplete):
3628 # TODO: Generalize code with YoutubeDL._build_format_filter
3629 STRING_OPERATORS = {
3630 '*=': operator.contains,
3631 '^=': lambda attr, value: attr.startswith(value),
3632 '$=': lambda attr, value: attr.endswith(value),
3633 '~=': lambda attr, value: re.search(value, attr),
3634 }
3635 COMPARISON_OPERATORS = {
3636 **STRING_OPERATORS,
3637 '<=': operator.le, # "<=" must be defined above "<"
3638 '<': operator.lt,
3639 '>=': operator.ge,
3640 '>': operator.gt,
3641 '=': operator.eq,
3642 }
3643
3644 if isinstance(incomplete, bool):
3645 is_incomplete = lambda _: incomplete
3646 else:
3647 is_incomplete = lambda k: k in incomplete
3648
3649 operator_rex = re.compile(r'''(?x)
3650 (?P<key>[a-z_]+)
3651 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3652 (?:
3653 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3654 (?P<strval>.+?)
3655 )
3656 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3657 m = operator_rex.fullmatch(filter_part.strip())
3658 if m:
3659 m = m.groupdict()
3660 unnegated_op = COMPARISON_OPERATORS[m['op']]
3661 if m['negation']:
3662 op = lambda attr, value: not unnegated_op(attr, value)
3663 else:
3664 op = unnegated_op
3665 comparison_value = m['quotedstrval'] or m['strval'] # the regex defines no 'intval' group; referencing it would raise KeyError
3666 if m['quote']:
3667 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3668 actual_value = dct.get(m['key'])
3669 numeric_comparison = None
3670 if isinstance(actual_value, (int, float)):
3671 # If the original field is a string and the matching comparison value is
3672 # a number, we should respect the origin of the original field
3673 # and process the comparison value as a string (see
3674 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3675 try:
3676 numeric_comparison = int(comparison_value)
3677 except ValueError:
3678 numeric_comparison = parse_filesize(comparison_value)
3679 if numeric_comparison is None:
3680 numeric_comparison = parse_filesize(f'{comparison_value}B')
3681 if numeric_comparison is None:
3682 numeric_comparison = parse_duration(comparison_value)
3683 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3684 raise ValueError('Operator %s only supports string values!' % m['op'])
3685 if actual_value is None:
3686 return is_incomplete(m['key']) or m['none_inclusive']
3687 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3688
3689 UNARY_OPERATORS = {
3690 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3691 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3692 }
3693 operator_rex = re.compile(r'''(?x)
3694 (?P<op>%s)\s*(?P<key>[a-z_]+)
3695 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3696 m = operator_rex.fullmatch(filter_part.strip())
3697 if m:
3698 op = UNARY_OPERATORS[m.group('op')]
3699 actual_value = dct.get(m.group('key'))
3700 if is_incomplete(m.group('key')) and actual_value is None:
3701 return True
3702 return op(actual_value)
3703
3704 raise ValueError('Invalid filter part %r' % filter_part)
3705
3706
3707 def match_str(filter_str, dct, incomplete=False):
3708 """ Filter a dictionary with a simple string syntax.
3709 @returns Whether the filter passes
3710 @param incomplete Set of keys that are expected to be missing from dct.
3711 Can be True/False to indicate that all/none of the keys may be missing.
3712 All conditions on incomplete keys pass if the key is missing.
3713 """
3714 return all(
3715 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3716 for filter_part in re.split(r'(?<!\\)&', filter_str))
3717
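# Illustrative example (hypothetical fields): 'like_count' is absent, but the
# trailing '?' makes that condition pass anyway:
#   >>> match_str('duration > 600 & like_count >? 100', {'duration': 700})
#   True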
3718
3719 def match_filter_func(filters):
3720 if not filters:
3721 return None
3722 filters = set(variadic(filters))
3723
3724 interactive = '-' in filters
3725 if interactive:
3726 filters.remove('-')
3727
3728 def _match_func(info_dict, incomplete=False):
3729 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3730 return NO_DEFAULT if interactive and not incomplete else None
3731 else:
3732 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3733 filter_str = ') | ('.join(map(str.strip, filters))
3734 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3735 return _match_func
3736
3737
3738 class download_range_func:
3739 def __init__(self, chapters, ranges):
3740 self.chapters, self.ranges = chapters, ranges
3741
3742 def __call__(self, info_dict, ydl):
3743 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3744 else 'Cannot match chapters since chapter information is unavailable')
3745 for regex in self.chapters or []:
3746 for i, chapter in enumerate(info_dict.get('chapters') or []):
3747 if re.search(regex, chapter['title']):
3748 warning = None
3749 yield {**chapter, 'index': i}
3750 if self.chapters and warning:
3751 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3752
3753 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3754
3755 def __eq__(self, other):
3756 return (isinstance(other, download_range_func)
3757 and self.chapters == other.chapters and self.ranges == other.ranges)
3758
3759
3760 def parse_dfxp_time_expr(time_expr):
3761 if not time_expr:
3762 return
3763
3764 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3765 if mobj:
3766 return float(mobj.group('time_offset'))
3767
3768 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3769 if mobj:
3770 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3771
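# Illustrative examples:
#   >>> parse_dfxp_time_expr('12.5s')
#   12.5
#   >>> parse_dfxp_time_expr('00:01:30.5')
#   90.5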
3772
3773 def srt_subtitles_timecode(seconds):
3774 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3775
3776
3777 def ass_subtitles_timecode(seconds):
3778 time = timetuple_from_msec(seconds * 1000)
3779 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3780
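# Illustrative examples:
#   >>> srt_subtitles_timecode(3.5)
#   '00:00:03,500'
#   >>> ass_subtitles_timecode(3.5)
#   '0:00:03.50'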
3781
3782 def dfxp2srt(dfxp_data):
3783 '''
3784 @param dfxp_data A bytes-like object containing DFXP data
3785 @returns A unicode object containing converted SRT data
3786 '''
3787 LEGACY_NAMESPACES = (
3788 (b'http://www.w3.org/ns/ttml', [
3789 b'http://www.w3.org/2004/11/ttaf1',
3790 b'http://www.w3.org/2006/04/ttaf1',
3791 b'http://www.w3.org/2006/10/ttaf1',
3792 ]),
3793 (b'http://www.w3.org/ns/ttml#styling', [
3794 b'http://www.w3.org/ns/ttml#style',
3795 ]),
3796 )
3797
3798 SUPPORTED_STYLING = [
3799 'color',
3800 'fontFamily',
3801 'fontSize',
3802 'fontStyle',
3803 'fontWeight',
3804 'textDecoration'
3805 ]
3806
3807 _x = functools.partial(xpath_with_ns, ns_map={
3808 'xml': 'http://www.w3.org/XML/1998/namespace',
3809 'ttml': 'http://www.w3.org/ns/ttml',
3810 'tts': 'http://www.w3.org/ns/ttml#styling',
3811 })
3812
3813 styles = {}
3814 default_style = {}
3815
3816 class TTMLPElementParser:
3817 _out = ''
3818 _unclosed_elements = []
3819 _applied_styles = []
3820
3821 def start(self, tag, attrib):
3822 if tag in (_x('ttml:br'), 'br'):
3823 self._out += '\n'
3824 else:
3825 unclosed_elements = []
3826 style = {}
3827 element_style_id = attrib.get('style')
3828 if default_style:
3829 style.update(default_style)
3830 if element_style_id:
3831 style.update(styles.get(element_style_id, {}))
3832 for prop in SUPPORTED_STYLING:
3833 prop_val = attrib.get(_x('tts:' + prop))
3834 if prop_val:
3835 style[prop] = prop_val
3836 if style:
3837 font = ''
3838 for k, v in sorted(style.items()):
3839 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3840 continue
3841 if k == 'color':
3842 font += ' color="%s"' % v
3843 elif k == 'fontSize':
3844 font += ' size="%s"' % v
3845 elif k == 'fontFamily':
3846 font += ' face="%s"' % v
3847 elif k == 'fontWeight' and v == 'bold':
3848 self._out += '<b>'
3849 unclosed_elements.append('b')
3850 elif k == 'fontStyle' and v == 'italic':
3851 self._out += '<i>'
3852 unclosed_elements.append('i')
3853 elif k == 'textDecoration' and v == 'underline':
3854 self._out += '<u>'
3855 unclosed_elements.append('u')
3856 if font:
3857 self._out += '<font' + font + '>'
3858 unclosed_elements.append('font')
3859 applied_style = {}
3860 if self._applied_styles:
3861 applied_style.update(self._applied_styles[-1])
3862 applied_style.update(style)
3863 self._applied_styles.append(applied_style)
3864 self._unclosed_elements.append(unclosed_elements)
3865
3866 def end(self, tag):
3867 if tag not in (_x('ttml:br'), 'br'):
3868 unclosed_elements = self._unclosed_elements.pop()
3869 for element in reversed(unclosed_elements):
3870 self._out += '</%s>' % element
3871 if unclosed_elements and self._applied_styles:
3872 self._applied_styles.pop()
3873
3874 def data(self, data):
3875 self._out += data
3876
3877 def close(self):
3878 return self._out.strip()
3879
3880 def parse_node(node):
3881 target = TTMLPElementParser()
3882 parser = xml.etree.ElementTree.XMLParser(target=target)
3883 parser.feed(xml.etree.ElementTree.tostring(node))
3884 return parser.close()
3885
3886 for k, v in LEGACY_NAMESPACES:
3887 for ns in v:
3888 dfxp_data = dfxp_data.replace(ns, k)
3889
3890 dfxp = compat_etree_fromstring(dfxp_data)
3891 out = []
3892 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3893
3894 if not paras:
3895 raise ValueError('Invalid dfxp/TTML subtitle')
3896
3897 repeat = False
3898 while True:
3899 for style in dfxp.findall(_x('.//ttml:style')):
3900 style_id = style.get('id') or style.get(_x('xml:id'))
3901 if not style_id:
3902 continue
3903 parent_style_id = style.get('style')
3904 if parent_style_id:
3905 if parent_style_id not in styles:
3906 repeat = True
3907 continue
3908 styles[style_id] = styles[parent_style_id].copy()
3909 for prop in SUPPORTED_STYLING:
3910 prop_val = style.get(_x('tts:' + prop))
3911 if prop_val:
3912 styles.setdefault(style_id, {})[prop] = prop_val
3913 if repeat:
3914 repeat = False
3915 else:
3916 break
3917
3918 for p in ('body', 'div'):
3919 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3920 if ele is None:
3921 continue
3922 style = styles.get(ele.get('style'))
3923 if not style:
3924 continue
3925 default_style.update(style)
3926
3927 for para, index in zip(paras, itertools.count(1)):
3928 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3929 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3930 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3931 if begin_time is None:
3932 continue
3933 if not end_time:
3934 if not dur:
3935 continue
3936 end_time = begin_time + dur
3937 out.append('%d\n%s --> %s\n%s\n\n' % (
3938 index,
3939 srt_subtitles_timecode(begin_time),
3940 srt_subtitles_timecode(end_time),
3941 parse_node(para)))
3942
3943 return ''.join(out)
3944
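# Illustrative example (minimal hypothetical document):
#   >>> dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#   ...          b'<p begin="0s" end="1s">Hi</p></div></body></tt>')
#   '1\n00:00:00,000 --> 00:00:01,000\nHi\n\n'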
3945
3946 def cli_option(params, command_option, param, separator=None):
3947 param = params.get(param)
3948 return ([] if param is None
3949 else [command_option, str(param)] if separator is None
3950 else [f'{command_option}{separator}{param}'])
3951
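# Illustrative examples (hypothetical option names):
#   >>> cli_option({'proxy': 'socks5://127.0.0.1'}, '--proxy', 'proxy')
#   ['--proxy', 'socks5://127.0.0.1']
#   >>> cli_option({'proxy': 'socks5://127.0.0.1'}, '--proxy', 'proxy', separator='=')
#   ['--proxy=socks5://127.0.0.1']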
3952
3953 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3954 param = params.get(param)
3955 assert param in (True, False, None)
3956 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
3957
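# Note: cli_bool_option reuses cli_option by passing the {True: ..., False: ...}
# mapping as `params` and the resolved boolean as the lookup key, e.g. (hypothetical):
#   >>> cli_bool_option({'check': False}, '--check', 'check')
#   ['--check', 'false']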
3958
3959 def cli_valueless_option(params, command_option, param, expected_value=True):
3960 return [command_option] if params.get(param) == expected_value else []
3961
3962
3963 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3964 if isinstance(argdict, (list, tuple)): # for backward compatibility
3965 if use_compat:
3966 return argdict
3967 else:
3968 argdict = None
3969 if argdict is None:
3970 return default
3971 assert isinstance(argdict, dict)
3972
3973 assert isinstance(keys, (list, tuple))
3974 for key_list in keys:
3975 arg_list = list(filter(
3976 lambda x: x is not None,
3977 [argdict.get(key.lower()) for key in variadic(key_list)]))
3978 if arg_list:
3979 return [arg for args in arg_list for arg in args]
3980 return default
3981
3982
3983 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3984 main_key, exe = main_key.lower(), exe.lower()
3985 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3986 keys = [f'{root_key}{k}' for k in (keys or [''])]
3987 if root_key in keys:
3988 if main_key != exe:
3989 keys.append((main_key, exe))
3990 keys.append('default')
3991 else:
3992 use_compat = False
3993 return cli_configuration_args(argdict, keys, default, use_compat)
3994
3995
3996 class ISO639Utils:
3997 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3998 _lang_map = {
3999 'aa': 'aar',
4000 'ab': 'abk',
4001 'ae': 'ave',
4002 'af': 'afr',
4003 'ak': 'aka',
4004 'am': 'amh',
4005 'an': 'arg',
4006 'ar': 'ara',
4007 'as': 'asm',
4008 'av': 'ava',
4009 'ay': 'aym',
4010 'az': 'aze',
4011 'ba': 'bak',
4012 'be': 'bel',
4013 'bg': 'bul',
4014 'bh': 'bih',
4015 'bi': 'bis',
4016 'bm': 'bam',
4017 'bn': 'ben',
4018 'bo': 'bod',
4019 'br': 'bre',
4020 'bs': 'bos',
4021 'ca': 'cat',
4022 'ce': 'che',
4023 'ch': 'cha',
4024 'co': 'cos',
4025 'cr': 'cre',
4026 'cs': 'ces',
4027 'cu': 'chu',
4028 'cv': 'chv',
4029 'cy': 'cym',
4030 'da': 'dan',
4031 'de': 'deu',
4032 'dv': 'div',
4033 'dz': 'dzo',
4034 'ee': 'ewe',
4035 'el': 'ell',
4036 'en': 'eng',
4037 'eo': 'epo',
4038 'es': 'spa',
4039 'et': 'est',
4040 'eu': 'eus',
4041 'fa': 'fas',
4042 'ff': 'ful',
4043 'fi': 'fin',
4044 'fj': 'fij',
4045 'fo': 'fao',
4046 'fr': 'fra',
4047 'fy': 'fry',
4048 'ga': 'gle',
4049 'gd': 'gla',
4050 'gl': 'glg',
4051 'gn': 'grn',
4052 'gu': 'guj',
4053 'gv': 'glv',
4054 'ha': 'hau',
4055 'he': 'heb',
4056 'iw': 'heb', # Replaced by he in 1989 revision
4057 'hi': 'hin',
4058 'ho': 'hmo',
4059 'hr': 'hrv',
4060 'ht': 'hat',
4061 'hu': 'hun',
4062 'hy': 'hye',
4063 'hz': 'her',
4064 'ia': 'ina',
4065 'id': 'ind',
4066 'in': 'ind', # Replaced by id in 1989 revision
4067 'ie': 'ile',
4068 'ig': 'ibo',
4069 'ii': 'iii',
4070 'ik': 'ipk',
4071 'io': 'ido',
4072 'is': 'isl',
4073 'it': 'ita',
4074 'iu': 'iku',
4075 'ja': 'jpn',
4076 'jv': 'jav',
4077 'ka': 'kat',
4078 'kg': 'kon',
4079 'ki': 'kik',
4080 'kj': 'kua',
4081 'kk': 'kaz',
4082 'kl': 'kal',
4083 'km': 'khm',
4084 'kn': 'kan',
4085 'ko': 'kor',
4086 'kr': 'kau',
4087 'ks': 'kas',
4088 'ku': 'kur',
4089 'kv': 'kom',
4090 'kw': 'cor',
4091 'ky': 'kir',
4092 'la': 'lat',
4093 'lb': 'ltz',
4094 'lg': 'lug',
4095 'li': 'lim',
4096 'ln': 'lin',
4097 'lo': 'lao',
4098 'lt': 'lit',
4099 'lu': 'lub',
4100 'lv': 'lav',
4101 'mg': 'mlg',
4102 'mh': 'mah',
4103 'mi': 'mri',
4104 'mk': 'mkd',
4105 'ml': 'mal',
4106 'mn': 'mon',
4107 'mr': 'mar',
4108 'ms': 'msa',
4109 'mt': 'mlt',
4110 'my': 'mya',
4111 'na': 'nau',
4112 'nb': 'nob',
4113 'nd': 'nde',
4114 'ne': 'nep',
4115 'ng': 'ndo',
4116 'nl': 'nld',
4117 'nn': 'nno',
4118 'no': 'nor',
4119 'nr': 'nbl',
4120 'nv': 'nav',
4121 'ny': 'nya',
4122 'oc': 'oci',
4123 'oj': 'oji',
4124 'om': 'orm',
4125 'or': 'ori',
4126 'os': 'oss',
4127 'pa': 'pan',
4128 'pi': 'pli',
4129 'pl': 'pol',
4130 'ps': 'pus',
4131 'pt': 'por',
4132 'qu': 'que',
4133 'rm': 'roh',
4134 'rn': 'run',
4135 'ro': 'ron',
4136 'ru': 'rus',
4137 'rw': 'kin',
4138 'sa': 'san',
4139 'sc': 'srd',
4140 'sd': 'snd',
4141 'se': 'sme',
4142 'sg': 'sag',
4143 'si': 'sin',
4144 'sk': 'slk',
4145 'sl': 'slv',
4146 'sm': 'smo',
4147 'sn': 'sna',
4148 'so': 'som',
4149 'sq': 'sqi',
4150 'sr': 'srp',
4151 'ss': 'ssw',
4152 'st': 'sot',
4153 'su': 'sun',
4154 'sv': 'swe',
4155 'sw': 'swa',
4156 'ta': 'tam',
4157 'te': 'tel',
4158 'tg': 'tgk',
4159 'th': 'tha',
4160 'ti': 'tir',
4161 'tk': 'tuk',
4162 'tl': 'tgl',
4163 'tn': 'tsn',
4164 'to': 'ton',
4165 'tr': 'tur',
4166 'ts': 'tso',
4167 'tt': 'tat',
4168 'tw': 'twi',
4169 'ty': 'tah',
4170 'ug': 'uig',
4171 'uk': 'ukr',
4172 'ur': 'urd',
4173 'uz': 'uzb',
4174 've': 'ven',
4175 'vi': 'vie',
4176 'vo': 'vol',
4177 'wa': 'wln',
4178 'wo': 'wol',
4179 'xh': 'xho',
4180 'yi': 'yid',
4181 'ji': 'yid', # Replaced by yi in 1989 revision
4182 'yo': 'yor',
4183 'za': 'zha',
4184 'zh': 'zho',
4185 'zu': 'zul',
4186 }
4187
4188 @classmethod
4189 def short2long(cls, code):
4190 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4191 return cls._lang_map.get(code[:2])
4192
4193 @classmethod
4194 def long2short(cls, code):
4195 """Convert language code from ISO 639-2/T to ISO 639-1"""
4196 for short_name, long_name in cls._lang_map.items():
4197 if long_name == code:
4198 return short_name
4199
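# Illustrative examples:
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'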
4200
4201 class ISO3166Utils:
4202 # From http://data.okfn.org/data/core/country-list
4203 _country_map = {
4204 'AF': 'Afghanistan',
4205 'AX': 'Åland Islands',
4206 'AL': 'Albania',
4207 'DZ': 'Algeria',
4208 'AS': 'American Samoa',
4209 'AD': 'Andorra',
4210 'AO': 'Angola',
4211 'AI': 'Anguilla',
4212 'AQ': 'Antarctica',
4213 'AG': 'Antigua and Barbuda',
4214 'AR': 'Argentina',
4215 'AM': 'Armenia',
4216 'AW': 'Aruba',
4217 'AU': 'Australia',
4218 'AT': 'Austria',
4219 'AZ': 'Azerbaijan',
4220 'BS': 'Bahamas',
4221 'BH': 'Bahrain',
4222 'BD': 'Bangladesh',
4223 'BB': 'Barbados',
4224 'BY': 'Belarus',
4225 'BE': 'Belgium',
4226 'BZ': 'Belize',
4227 'BJ': 'Benin',
4228 'BM': 'Bermuda',
4229 'BT': 'Bhutan',
4230 'BO': 'Bolivia, Plurinational State of',
4231 'BQ': 'Bonaire, Sint Eustatius and Saba',
4232 'BA': 'Bosnia and Herzegovina',
4233 'BW': 'Botswana',
4234 'BV': 'Bouvet Island',
4235 'BR': 'Brazil',
4236 'IO': 'British Indian Ocean Territory',
4237 'BN': 'Brunei Darussalam',
4238 'BG': 'Bulgaria',
4239 'BF': 'Burkina Faso',
4240 'BI': 'Burundi',
4241 'KH': 'Cambodia',
4242 'CM': 'Cameroon',
4243 'CA': 'Canada',
4244 'CV': 'Cape Verde',
4245 'KY': 'Cayman Islands',
4246 'CF': 'Central African Republic',
4247 'TD': 'Chad',
4248 'CL': 'Chile',
4249 'CN': 'China',
4250 'CX': 'Christmas Island',
4251 'CC': 'Cocos (Keeling) Islands',
4252 'CO': 'Colombia',
4253 'KM': 'Comoros',
4254 'CG': 'Congo',
4255 'CD': 'Congo, the Democratic Republic of the',
4256 'CK': 'Cook Islands',
4257 'CR': 'Costa Rica',
4258 'CI': 'Côte d\'Ivoire',
4259 'HR': 'Croatia',
4260 'CU': 'Cuba',
4261 'CW': 'Curaçao',
4262 'CY': 'Cyprus',
4263 'CZ': 'Czech Republic',
4264 'DK': 'Denmark',
4265 'DJ': 'Djibouti',
4266 'DM': 'Dominica',
4267 'DO': 'Dominican Republic',
4268 'EC': 'Ecuador',
4269 'EG': 'Egypt',
4270 'SV': 'El Salvador',
4271 'GQ': 'Equatorial Guinea',
4272 'ER': 'Eritrea',
4273 'EE': 'Estonia',
4274 'ET': 'Ethiopia',
4275 'FK': 'Falkland Islands (Malvinas)',
4276 'FO': 'Faroe Islands',
4277 'FJ': 'Fiji',
4278 'FI': 'Finland',
4279 'FR': 'France',
4280 'GF': 'French Guiana',
4281 'PF': 'French Polynesia',
4282 'TF': 'French Southern Territories',
4283 'GA': 'Gabon',
4284 'GM': 'Gambia',
4285 'GE': 'Georgia',
4286 'DE': 'Germany',
4287 'GH': 'Ghana',
4288 'GI': 'Gibraltar',
4289 'GR': 'Greece',
4290 'GL': 'Greenland',
4291 'GD': 'Grenada',
4292 'GP': 'Guadeloupe',
4293 'GU': 'Guam',
4294 'GT': 'Guatemala',
4295 'GG': 'Guernsey',
4296 'GN': 'Guinea',
4297 'GW': 'Guinea-Bissau',
4298 'GY': 'Guyana',
4299 'HT': 'Haiti',
4300 'HM': 'Heard Island and McDonald Islands',
4301 'VA': 'Holy See (Vatican City State)',
4302 'HN': 'Honduras',
4303 'HK': 'Hong Kong',
4304 'HU': 'Hungary',
4305 'IS': 'Iceland',
4306 'IN': 'India',
4307 'ID': 'Indonesia',
4308 'IR': 'Iran, Islamic Republic of',
4309 'IQ': 'Iraq',
4310 'IE': 'Ireland',
4311 'IM': 'Isle of Man',
4312 'IL': 'Israel',
4313 'IT': 'Italy',
4314 'JM': 'Jamaica',
4315 'JP': 'Japan',
4316 'JE': 'Jersey',
4317 'JO': 'Jordan',
4318 'KZ': 'Kazakhstan',
4319 'KE': 'Kenya',
4320 'KI': 'Kiribati',
4321 'KP': 'Korea, Democratic People\'s Republic of',
4322 'KR': 'Korea, Republic of',
4323 'KW': 'Kuwait',
4324 'KG': 'Kyrgyzstan',
4325 'LA': 'Lao People\'s Democratic Republic',
4326 'LV': 'Latvia',
4327 'LB': 'Lebanon',
4328 'LS': 'Lesotho',
4329 'LR': 'Liberia',
4330 'LY': 'Libya',
4331 'LI': 'Liechtenstein',
4332 'LT': 'Lithuania',
4333 'LU': 'Luxembourg',
4334 'MO': 'Macao',
4335 'MK': 'Macedonia, the Former Yugoslav Republic of',
4336 'MG': 'Madagascar',
4337 'MW': 'Malawi',
4338 'MY': 'Malaysia',
4339 'MV': 'Maldives',
4340 'ML': 'Mali',
4341 'MT': 'Malta',
4342 'MH': 'Marshall Islands',
4343 'MQ': 'Martinique',
4344 'MR': 'Mauritania',
4345 'MU': 'Mauritius',
4346 'YT': 'Mayotte',
4347 'MX': 'Mexico',
4348 'FM': 'Micronesia, Federated States of',
4349 'MD': 'Moldova, Republic of',
4350 'MC': 'Monaco',
4351 'MN': 'Mongolia',
4352 'ME': 'Montenegro',
4353 'MS': 'Montserrat',
4354 'MA': 'Morocco',
4355 'MZ': 'Mozambique',
4356 'MM': 'Myanmar',
4357 'NA': 'Namibia',
4358 'NR': 'Nauru',
4359 'NP': 'Nepal',
4360 'NL': 'Netherlands',
4361 'NC': 'New Caledonia',
4362 'NZ': 'New Zealand',
4363 'NI': 'Nicaragua',
4364 'NE': 'Niger',
4365 'NG': 'Nigeria',
4366 'NU': 'Niue',
4367 'NF': 'Norfolk Island',
4368 'MP': 'Northern Mariana Islands',
4369 'NO': 'Norway',
4370 'OM': 'Oman',
4371 'PK': 'Pakistan',
4372 'PW': 'Palau',
4373 'PS': 'Palestine, State of',
4374 'PA': 'Panama',
4375 'PG': 'Papua New Guinea',
4376 'PY': 'Paraguay',
4377 'PE': 'Peru',
4378 'PH': 'Philippines',
4379 'PN': 'Pitcairn',
4380 'PL': 'Poland',
4381 'PT': 'Portugal',
4382 'PR': 'Puerto Rico',
4383 'QA': 'Qatar',
4384 'RE': 'Réunion',
4385 'RO': 'Romania',
4386 'RU': 'Russian Federation',
4387 'RW': 'Rwanda',
4388 'BL': 'Saint Barthélemy',
4389 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4390 'KN': 'Saint Kitts and Nevis',
4391 'LC': 'Saint Lucia',
4392 'MF': 'Saint Martin (French part)',
4393 'PM': 'Saint Pierre and Miquelon',
4394 'VC': 'Saint Vincent and the Grenadines',
4395 'WS': 'Samoa',
4396 'SM': 'San Marino',
4397 'ST': 'Sao Tome and Principe',
4398 'SA': 'Saudi Arabia',
4399 'SN': 'Senegal',
4400 'RS': 'Serbia',
4401 'SC': 'Seychelles',
4402 'SL': 'Sierra Leone',
4403 'SG': 'Singapore',
4404 'SX': 'Sint Maarten (Dutch part)',
4405 'SK': 'Slovakia',
4406 'SI': 'Slovenia',
4407 'SB': 'Solomon Islands',
4408 'SO': 'Somalia',
4409 'ZA': 'South Africa',
4410 'GS': 'South Georgia and the South Sandwich Islands',
4411 'SS': 'South Sudan',
4412 'ES': 'Spain',
4413 'LK': 'Sri Lanka',
4414 'SD': 'Sudan',
4415 'SR': 'Suriname',
4416 'SJ': 'Svalbard and Jan Mayen',
4417 'SZ': 'Swaziland',
4418 'SE': 'Sweden',
4419 'CH': 'Switzerland',
4420 'SY': 'Syrian Arab Republic',
4421 'TW': 'Taiwan, Province of China',
4422 'TJ': 'Tajikistan',
4423 'TZ': 'Tanzania, United Republic of',
4424 'TH': 'Thailand',
4425 'TL': 'Timor-Leste',
4426 'TG': 'Togo',
4427 'TK': 'Tokelau',
4428 'TO': 'Tonga',
4429 'TT': 'Trinidad and Tobago',
4430 'TN': 'Tunisia',
4431 'TR': 'Turkey',
4432 'TM': 'Turkmenistan',
4433 'TC': 'Turks and Caicos Islands',
4434 'TV': 'Tuvalu',
4435 'UG': 'Uganda',
4436 'UA': 'Ukraine',
4437 'AE': 'United Arab Emirates',
4438 'GB': 'United Kingdom',
4439 'US': 'United States',
4440 'UM': 'United States Minor Outlying Islands',
4441 'UY': 'Uruguay',
4442 'UZ': 'Uzbekistan',
4443 'VU': 'Vanuatu',
4444 'VE': 'Venezuela, Bolivarian Republic of',
4445 'VN': 'Viet Nam',
4446 'VG': 'Virgin Islands, British',
4447 'VI': 'Virgin Islands, U.S.',
4448 'WF': 'Wallis and Futuna',
4449 'EH': 'Western Sahara',
4450 'YE': 'Yemen',
4451 'ZM': 'Zambia',
4452 'ZW': 'Zimbabwe',
4453 # Not ISO 3166 codes, but used for IP blocks
4454 'AP': 'Asia/Pacific Region',
4455 'EU': 'Europe',
4456 }
4457
4458 @classmethod
4459 def short2full(cls, code):
4460 """Convert an ISO 3166-2 country code to the corresponding full name"""
4461 return cls._country_map.get(code.upper())
4462
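# Illustrative example:
#   >>> ISO3166Utils.short2full('gb')
#   'United Kingdom'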
4463
4464 class GeoUtils:
4465 # Major IPv4 address blocks per country
4466 _country_ip_map = {
4467 'AD': '46.172.224.0/19',
4468 'AE': '94.200.0.0/13',
4469 'AF': '149.54.0.0/17',
4470 'AG': '209.59.64.0/18',
4471 'AI': '204.14.248.0/21',
4472 'AL': '46.99.0.0/16',
4473 'AM': '46.70.0.0/15',
4474 'AO': '105.168.0.0/13',
4475 'AP': '182.50.184.0/21',
4476 'AQ': '23.154.160.0/24',
4477 'AR': '181.0.0.0/12',
4478 'AS': '202.70.112.0/20',
4479 'AT': '77.116.0.0/14',
4480 'AU': '1.128.0.0/11',
4481 'AW': '181.41.0.0/18',
4482 'AX': '185.217.4.0/22',
4483 'AZ': '5.197.0.0/16',
4484 'BA': '31.176.128.0/17',
4485 'BB': '65.48.128.0/17',
4486 'BD': '114.130.0.0/16',
4487 'BE': '57.0.0.0/8',
4488 'BF': '102.178.0.0/15',
4489 'BG': '95.42.0.0/15',
4490 'BH': '37.131.0.0/17',
4491 'BI': '154.117.192.0/18',
4492 'BJ': '137.255.0.0/16',
4493 'BL': '185.212.72.0/23',
4494 'BM': '196.12.64.0/18',
4495 'BN': '156.31.0.0/16',
4496 'BO': '161.56.0.0/16',
4497 'BQ': '161.0.80.0/20',
4498 'BR': '191.128.0.0/12',
4499 'BS': '24.51.64.0/18',
4500 'BT': '119.2.96.0/19',
4501 'BW': '168.167.0.0/16',
4502 'BY': '178.120.0.0/13',
4503 'BZ': '179.42.192.0/18',
4504 'CA': '99.224.0.0/11',
4505 'CD': '41.243.0.0/16',
4506 'CF': '197.242.176.0/21',
4507 'CG': '160.113.0.0/16',
4508 'CH': '85.0.0.0/13',
4509 'CI': '102.136.0.0/14',
4510 'CK': '202.65.32.0/19',
4511 'CL': '152.172.0.0/14',
4512 'CM': '102.244.0.0/14',
4513 'CN': '36.128.0.0/10',
4514 'CO': '181.240.0.0/12',
4515 'CR': '201.192.0.0/12',
4516 'CU': '152.206.0.0/15',
4517 'CV': '165.90.96.0/19',
4518 'CW': '190.88.128.0/17',
4519 'CY': '31.153.0.0/16',
4520 'CZ': '88.100.0.0/14',
4521 'DE': '53.0.0.0/8',
4522 'DJ': '197.241.0.0/17',
4523 'DK': '87.48.0.0/12',
4524 'DM': '192.243.48.0/20',
4525 'DO': '152.166.0.0/15',
4526 'DZ': '41.96.0.0/12',
4527 'EC': '186.68.0.0/15',
4528 'EE': '90.190.0.0/15',
4529 'EG': '156.160.0.0/11',
4530 'ER': '196.200.96.0/20',
4531 'ES': '88.0.0.0/11',
4532 'ET': '196.188.0.0/14',
4533 'EU': '2.16.0.0/13',
4534 'FI': '91.152.0.0/13',
4535 'FJ': '144.120.0.0/16',
4536 'FK': '80.73.208.0/21',
4537 'FM': '119.252.112.0/20',
4538 'FO': '88.85.32.0/19',
4539 'FR': '90.0.0.0/9',
4540 'GA': '41.158.0.0/15',
4541 'GB': '25.0.0.0/8',
4542 'GD': '74.122.88.0/21',
4543 'GE': '31.146.0.0/16',
4544 'GF': '161.22.64.0/18',
4545 'GG': '62.68.160.0/19',
4546 'GH': '154.160.0.0/12',
4547 'GI': '95.164.0.0/16',
4548 'GL': '88.83.0.0/19',
4549 'GM': '160.182.0.0/15',
4550 'GN': '197.149.192.0/18',
4551 'GP': '104.250.0.0/19',
4552 'GQ': '105.235.224.0/20',
4553 'GR': '94.64.0.0/13',
4554 'GT': '168.234.0.0/16',
4555 'GU': '168.123.0.0/16',
4556 'GW': '197.214.80.0/20',
4557 'GY': '181.41.64.0/18',
4558 'HK': '113.252.0.0/14',
4559 'HN': '181.210.0.0/16',
4560 'HR': '93.136.0.0/13',
4561 'HT': '148.102.128.0/17',
4562 'HU': '84.0.0.0/14',
4563 'ID': '39.192.0.0/10',
4564 'IE': '87.32.0.0/12',
4565 'IL': '79.176.0.0/13',
4566 'IM': '5.62.80.0/20',
4567 'IN': '117.192.0.0/10',
4568 'IO': '203.83.48.0/21',
4569 'IQ': '37.236.0.0/14',
4570 'IR': '2.176.0.0/12',
4571 'IS': '82.221.0.0/16',
4572 'IT': '79.0.0.0/10',
4573 'JE': '87.244.64.0/18',
4574 'JM': '72.27.0.0/17',
4575 'JO': '176.29.0.0/16',
4576 'JP': '133.0.0.0/8',
4577 'KE': '105.48.0.0/12',
4578 'KG': '158.181.128.0/17',
4579 'KH': '36.37.128.0/17',
4580 'KI': '103.25.140.0/22',
4581 'KM': '197.255.224.0/20',
4582 'KN': '198.167.192.0/19',
4583 'KP': '175.45.176.0/22',
4584 'KR': '175.192.0.0/10',
4585 'KW': '37.36.0.0/14',
4586 'KY': '64.96.0.0/15',
4587 'KZ': '2.72.0.0/13',
4588 'LA': '115.84.64.0/18',
4589 'LB': '178.135.0.0/16',
4590 'LC': '24.92.144.0/20',
4591 'LI': '82.117.0.0/19',
4592 'LK': '112.134.0.0/15',
4593 'LR': '102.183.0.0/16',
4594 'LS': '129.232.0.0/17',
4595 'LT': '78.56.0.0/13',
4596 'LU': '188.42.0.0/16',
4597 'LV': '46.109.0.0/16',
4598 'LY': '41.252.0.0/14',
4599 'MA': '105.128.0.0/11',
4600 'MC': '88.209.64.0/18',
4601 'MD': '37.246.0.0/16',
4602 'ME': '178.175.0.0/17',
4603 'MF': '74.112.232.0/21',
4604 'MG': '154.126.0.0/17',
4605 'MH': '117.103.88.0/21',
4606 'MK': '77.28.0.0/15',
4607 'ML': '154.118.128.0/18',
4608 'MM': '37.111.0.0/17',
4609 'MN': '49.0.128.0/17',
4610 'MO': '60.246.0.0/16',
4611 'MP': '202.88.64.0/20',
4612 'MQ': '109.203.224.0/19',
4613 'MR': '41.188.64.0/18',
4614 'MS': '208.90.112.0/22',
4615 'MT': '46.11.0.0/16',
4616 'MU': '105.16.0.0/12',
4617 'MV': '27.114.128.0/18',
4618 'MW': '102.70.0.0/15',
4619 'MX': '187.192.0.0/11',
4620 'MY': '175.136.0.0/13',
4621 'MZ': '197.218.0.0/15',
4622 'NA': '41.182.0.0/16',
4623 'NC': '101.101.0.0/18',
4624 'NE': '197.214.0.0/18',
4625 'NF': '203.17.240.0/22',
4626 'NG': '105.112.0.0/12',
4627 'NI': '186.76.0.0/15',
4628 'NL': '145.96.0.0/11',
4629 'NO': '84.208.0.0/13',
4630 'NP': '36.252.0.0/15',
4631 'NR': '203.98.224.0/19',
4632 'NU': '49.156.48.0/22',
4633 'NZ': '49.224.0.0/14',
4634 'OM': '5.36.0.0/15',
4635 'PA': '186.72.0.0/15',
4636 'PE': '186.160.0.0/14',
4637 'PF': '123.50.64.0/18',
4638 'PG': '124.240.192.0/19',
4639 'PH': '49.144.0.0/13',
4640 'PK': '39.32.0.0/11',
4641 'PL': '83.0.0.0/11',
4642 'PM': '70.36.0.0/20',
4643 'PR': '66.50.0.0/16',
4644 'PS': '188.161.0.0/16',
4645 'PT': '85.240.0.0/13',
4646 'PW': '202.124.224.0/20',
4647 'PY': '181.120.0.0/14',
4648 'QA': '37.210.0.0/15',
4649 'RE': '102.35.0.0/16',
4650 'RO': '79.112.0.0/13',
4651 'RS': '93.86.0.0/15',
4652 'RU': '5.136.0.0/13',
4653 'RW': '41.186.0.0/16',
4654 'SA': '188.48.0.0/13',
4655 'SB': '202.1.160.0/19',
4656 'SC': '154.192.0.0/11',
4657 'SD': '102.120.0.0/13',
4658 'SE': '78.64.0.0/12',
4659 'SG': '8.128.0.0/10',
4660 'SI': '188.196.0.0/14',
4661 'SK': '78.98.0.0/15',
4662 'SL': '102.143.0.0/17',
4663 'SM': '89.186.32.0/19',
4664 'SN': '41.82.0.0/15',
4665 'SO': '154.115.192.0/18',
4666 'SR': '186.179.128.0/17',
4667 'SS': '105.235.208.0/21',
4668 'ST': '197.159.160.0/19',
4669 'SV': '168.243.0.0/16',
4670 'SX': '190.102.0.0/20',
4671 'SY': '5.0.0.0/16',
4672 'SZ': '41.84.224.0/19',
4673 'TC': '65.255.48.0/20',
4674 'TD': '154.68.128.0/19',
4675 'TG': '196.168.0.0/14',
4676 'TH': '171.96.0.0/13',
4677 'TJ': '85.9.128.0/18',
4678 'TK': '27.96.24.0/21',
4679 'TL': '180.189.160.0/20',
4680 'TM': '95.85.96.0/19',
4681 'TN': '197.0.0.0/11',
4682 'TO': '175.176.144.0/21',
4683 'TR': '78.160.0.0/11',
4684 'TT': '186.44.0.0/15',
4685 'TV': '202.2.96.0/19',
4686 'TW': '120.96.0.0/11',
4687 'TZ': '156.156.0.0/14',
4688 'UA': '37.52.0.0/14',
4689 'UG': '102.80.0.0/13',
4690 'US': '6.0.0.0/8',
4691 'UY': '167.56.0.0/13',
4692 'UZ': '84.54.64.0/18',
4693 'VA': '212.77.0.0/19',
4694 'VC': '207.191.240.0/21',
4695 'VE': '186.88.0.0/13',
4696 'VG': '66.81.192.0/20',
4697 'VI': '146.226.0.0/16',
4698 'VN': '14.160.0.0/11',
4699 'VU': '202.80.32.0/20',
4700 'WF': '117.20.32.0/21',
4701 'WS': '202.4.32.0/19',
4702 'YE': '134.35.0.0/16',
4703 'YT': '41.242.116.0/22',
4704 'ZA': '41.0.0.0/11',
4705 'ZM': '102.144.0.0/13',
4706 'ZW': '102.177.192.0/18',
4707 }
4708
4709 @classmethod
4710 def random_ipv4(cls, code_or_block):
4711 if len(code_or_block) == 2:
4712 block = cls._country_ip_map.get(code_or_block.upper())
4713 if not block:
4714 return None
4715 else:
4716 block = code_or_block
4717 addr, preflen = block.split('/')
4718 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4719 addr_max = addr_min | (0xffffffff >> int(preflen))
4720 return str(socket.inet_ntoa(
4721 struct.pack('!L', random.randint(addr_min, addr_max))))
4722
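# Illustrative example: a two-letter argument is looked up as a country code,
# anything else is treated as a CIDR block; the result is a random address
# within that block, e.g. GeoUtils.random_ipv4('1.2.3.0/24') -> '1.2.3.57'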
4723
4724 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4725 def __init__(self, proxies=None):
4726 # Set default handlers
4727 for type in ('http', 'https'):
4728 setattr(self, '%s_open' % type,
4729 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4730 meth(r, proxy, type))
4731 urllib.request.ProxyHandler.__init__(self, proxies)
4732
4733 def proxy_open(self, req, proxy, type):
4734 req_proxy = req.headers.get('Ytdl-request-proxy')
4735 if req_proxy is not None:
4736 proxy = req_proxy
4737 del req.headers['Ytdl-request-proxy']
4738
4739 if proxy == '__noproxy__':
4740 return None # No Proxy
4741 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4742 req.add_header('Ytdl-socks-proxy', proxy)
4743 # yt-dlp's http/https handlers do the actual wrapping of the socket with SOCKS
4744 return None
4745 return urllib.request.ProxyHandler.proxy_open(
4746 self, req, proxy, type)
4747
4748
4749 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4750 # released into the public domain
4751 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4752
4753 def long_to_bytes(n, blocksize=0):
4754 """long_to_bytes(n:long, blocksize:int) : string
4755 Convert a long integer to a byte string.
4756
4757 If optional blocksize is given and greater than zero, pad the front of the
4758 byte string with binary zeros so that the length is a multiple of
4759 blocksize.
4760 """
4761 # after much testing, this algorithm was deemed to be the fastest
4762 s = b''
4763 n = int(n)
4764 while n > 0:
4765 s = struct.pack('>I', n & 0xffffffff) + s
4766 n = n >> 32
4767 # strip off leading zeros
4768 for i in range(len(s)):
4769 if s[i] != b'\000'[0]:
4770 break
4771 else:
4772 # only happens when n == 0
4773 s = b'\000'
4774 i = 0
4775 s = s[i:]
4776 # add back some pad bytes. this could be done more efficiently w.r.t. the
4777 # de-padding being done above, but sigh...
4778 if blocksize > 0 and len(s) % blocksize:
4779 s = (blocksize - len(s) % blocksize) * b'\000' + s
4780 return s
4781
4782
4783 def bytes_to_long(s):
4784 """bytes_to_long(string) : long
4785 Convert a byte string to a long integer.
4786
4787 This is (essentially) the inverse of long_to_bytes().
4788 """
4789 acc = 0
4790 length = len(s)
4791 if length % 4:
4792 extra = (4 - length % 4)
4793 s = b'\000' * extra + s
4794 length = length + extra
4795 for i in range(0, length, 4):
4796 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4797 return acc
4798
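# Illustrative examples:
#   >>> long_to_bytes(256)
#   b'\x01\x00'
#   >>> long_to_bytes(1, blocksize=4)
#   b'\x00\x00\x00\x01'
#   >>> bytes_to_long(b'\x01\x00')
#   256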
4799
4800 def ohdave_rsa_encrypt(data, exponent, modulus):
4801 '''
4802 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4803
4804 Input:
4805 data: data to encrypt, bytes-like object
4806 exponent, modulus: parameter e and N of RSA algorithm, both integer
4807 Output: hex string of encrypted data
4808
4809 Limitation: supports one block encryption only
4810 '''
4811
4812 payload = int(binascii.hexlify(data[::-1]), 16)
4813 encrypted = pow(payload, exponent, modulus)
4814 return '%x' % encrypted
4815
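# Illustrative example (toy parameters, not a real key):
#   >>> ohdave_rsa_encrypt(b'\x02', 5, 33)   # pow(2, 5, 33) == 32
#   '20'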
4816
4817 def pkcs1pad(data, length):
4818 """
4819 Padding input data with PKCS#1 scheme
4820
4821 @param {int[]} data input data
4822 @param {int} length target length
4823 @returns {int[]} padded data
4824 """
4825 if len(data) > length - 11:
4826 raise ValueError('Input data too long for PKCS#1 padding')
4827
4828 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4829 return [0, 2] + pseudo_random + [0] + data
4830
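# Illustrative example: pkcs1pad([1, 2, 3], 16) yields [0, 2, <10 random
# bytes>, 0, 1, 2, 3]. Note that strict PKCS#1 v1.5 requires the random
# padding bytes to be nonzero, whereas randint(0, 254) may produce zeros.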
4831
4832 def _base_n_table(n, table):
4833 if not table and not n:
4834 raise ValueError('Either table or n must be specified')
4835 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4836
4837 if n and n != len(table):
4838 raise ValueError(f'base {n} exceeds table length {len(table)}')
4839 return table
4840
4841
4842 def encode_base_n(num, n=None, table=None):
4843 """Convert given int to a base-n string"""
4844 table = _base_n_table(n, table)
4845 if not num:
4846 return table[0]
4847
4848 result, base = '', len(table)
4849 while num:
4850 result = table[num % base] + result
4851 num = num // base
4852 return result
4853
4854
4855 def decode_base_n(string, n=None, table=None):
4856 """Convert given base-n string to int"""
4857 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4858 result, base = 0, len(table)
4859 for char in string:
4860 result = result * base + table[char]
4861 return result
4862
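# Illustrative examples:
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255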
4863
4864 def decode_base(value, digits):
4865 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4866 'and may be removed in a future version. Use yt_dlp.decode_base_n instead')
4867 return decode_base_n(value, table=digits)
4868
4869
4870 def decode_packed_codes(code):
4871 mobj = re.search(PACKED_CODES_RE, code)
4872 obfuscated_code, base, count, symbols = mobj.groups()
4873 base = int(base)
4874 count = int(count)
4875 symbols = symbols.split('|')
4876 symbol_table = {}
4877
4878 while count:
4879 count -= 1
4880 base_n_count = encode_base_n(count, base)
4881 symbol_table[base_n_count] = symbols[count] or base_n_count
4882
4883 return re.sub(
4884 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4885 obfuscated_code)
4886
4887
4888 def caesar(s, alphabet, shift):
4889 if shift == 0:
4890 return s
4891 l = len(alphabet)
4892 return ''.join(
4893 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4894 for c in s)
4895
4896
4897 def rot47(s):
4898 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4899
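# Illustrative example: rot47 shifts within the 94 printable ASCII characters
# and is an involution (applying it twice restores the input):
#   >>> rot47('foo')
#   '7@@'
#   >>> rot47(rot47('foo'))
#   'foo'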
4900
4901 def parse_m3u8_attributes(attrib):
4902 info = {}
4903 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4904 if val.startswith('"'):
4905 val = val[1:-1]
4906 info[key] = val
4907 return info
4908
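# Illustrative example (hypothetical attribute list):
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.64001f"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.64001f'}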
4909
4910 def urshift(val, n):
4911 return val >> n if val >= 0 else (val + 0x100000000) >> n
4912
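# Illustrative example: urshift emulates JavaScript's unsigned 32-bit shift:
#   >>> urshift(-1, 28)   # (-1 + 0x100000000) >> 28
#   15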
4913
4914 # Based on png2str() written by @gdkchan and improved by @yokrysty
4915 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4916 def decode_png(png_data):
4917 # Reference: https://www.w3.org/TR/PNG/
4918 header = png_data[8:]
4919
4920 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4921 raise OSError('Not a valid PNG file.')
4922
4923 int_map = {1: '>B', 2: '>H', 4: '>I'}
4924 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
4925
4926 chunks = []
4927
4928 while header:
4929 length = unpack_integer(header[:4])
4930 header = header[4:]
4931
4932 chunk_type = header[:4]
4933 header = header[4:]
4934
4935 chunk_data = header[:length]
4936 header = header[length:]
4937
4938 header = header[4:] # Skip CRC
4939
4940 chunks.append({
4941 'type': chunk_type,
4942 'length': length,
4943 'data': chunk_data
4944 })
4945
4946 ihdr = chunks[0]['data']
4947
4948 width = unpack_integer(ihdr[:4])
4949 height = unpack_integer(ihdr[4:8])
4950
4951 idat = b''
4952
4953 for chunk in chunks:
4954 if chunk['type'] == b'IDAT':
4955 idat += chunk['data']
4956
4957 if not idat:
4958 raise OSError('Unable to read PNG data.')
4959
4960 decompressed_data = bytearray(zlib.decompress(idat))
4961
4962 stride = width * 3
4963 pixels = []
4964
4965 def _get_pixel(idx):
4966 x = idx % stride
4967 y = idx // stride
4968 return pixels[y][x]
4969
4970 for y in range(height):
4971 basePos = y * (1 + stride)
4972 filter_type = decompressed_data[basePos]
4973
4974 current_row = []
4975
4976 pixels.append(current_row)
4977
4978 for x in range(stride):
4979 color = decompressed_data[1 + basePos + x]
4980 basex = y * stride + x
4981 left = 0
4982 up = 0
4983
4984 if x > 2:
4985 left = _get_pixel(basex - 3)
4986 if y > 0:
4987 up = _get_pixel(basex - stride)
4988
4989 if filter_type == 1: # Sub
4990 color = (color + left) & 0xff
4991 elif filter_type == 2: # Up
4992 color = (color + up) & 0xff
4993 elif filter_type == 3: # Average
4994 color = (color + ((left + up) >> 1)) & 0xff
4995 elif filter_type == 4: # Paeth
4996 a = left
4997 b = up
4998 c = 0
4999
5000 if x > 2 and y > 0:
5001 c = _get_pixel(basex - stride - 3)
5002
5003 p = a + b - c
5004
5005 pa = abs(p - a)
5006 pb = abs(p - b)
5007 pc = abs(p - c)
5008
5009 if pa <= pb and pa <= pc:
5010 color = (color + a) & 0xff
5011 elif pb <= pc:
5012 color = (color + b) & 0xff
5013 else:
5014 color = (color + c) & 0xff
5015
5016 current_row.append(color)
5017
5018 return width, height, pixels
5019
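# Worked example of the Paeth predictor above: with left a=10, up b=20,
# upper-left c=15, p = a + b - c = 15, so pa=5, pb=5, pc=0; pc is smallest,
# hence the predictor is c and the filtered byte is offset by 15.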
5020
5021 def write_xattr(path, key, value):
5022 # Windows: Write xattrs to NTFS Alternate Data Streams:
5023 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5024 if compat_os_name == 'nt':
5025 assert ':' not in key
5026 assert os.path.exists(path)
5027
5028 try:
5029 with open(f'{path}:{key}', 'wb') as f:
5030 f.write(value)
5031 except OSError as e:
5032 raise XAttrMetadataError(e.errno, e.strerror)
5033 return
5034
5035 # UNIX Method 1. Use xattrs/pyxattrs modules
5036
5037 setxattr = None
5038 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5039 # Unicode arguments are not supported in pyxattr until version 0.5.0
5040 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5041 if version_tuple(xattr.__version__) >= (0, 5, 0):
5042 setxattr = xattr.set
5043 elif xattr:
5044 setxattr = xattr.setxattr
5045
5046 if setxattr:
5047 try:
5048 setxattr(path, key, value)
5049 except OSError as e:
5050 raise XAttrMetadataError(e.errno, e.strerror)
5051 return
5052
5053 # UNIX Method 2. Use setfattr/xattr executables
5054 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5055 else 'xattr' if check_executable('xattr', ['-h']) else None)
5056 if not exe:
5057 raise XAttrUnavailableError(
5058 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5059 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5060
5061 value = value.decode()
5062 try:
5063 _, stderr, returncode = Popen.run(
5064 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5065 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5066 except OSError as e:
5067 raise XAttrMetadataError(e.errno, e.strerror)
5068 if returncode:
5069 raise XAttrMetadataError(returncode, stderr)
5070
5071
5072 def random_birthday(year_field, month_field, day_field):
5073 start_date = datetime.date(1950, 1, 1)
5074 end_date = datetime.date(1995, 12, 31)
5075 offset = random.randint(0, (end_date - start_date).days)
5076 random_date = start_date + datetime.timedelta(offset)
5077 return {
5078 year_field: str(random_date.year),
5079 month_field: str(random_date.month),
5080 day_field: str(random_date.day),
5081 }
5082
5083
5084 # Templates for internet shortcut files, which are plain text files.
5085 DOT_URL_LINK_TEMPLATE = '''\
5086 [InternetShortcut]
5087 URL=%(url)s
5088 '''
5089
5090 DOT_WEBLOC_LINK_TEMPLATE = '''\
5091 <?xml version="1.0" encoding="UTF-8"?>
5092 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5093 <plist version="1.0">
5094 <dict>
5095 \t<key>URL</key>
5096 \t<string>%(url)s</string>
5097 </dict>
5098 </plist>
5099 '''
5100
5101 DOT_DESKTOP_LINK_TEMPLATE = '''\
5102 [Desktop Entry]
5103 Encoding=UTF-8
5104 Name=%(filename)s
5105 Type=Link
5106 URL=%(url)s
5107 Icon=text-html
5108 '''
5109
5110 LINK_TEMPLATES = {
5111 'url': DOT_URL_LINK_TEMPLATE,
5112 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5113 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5114 }
5115
5116
5117 def iri_to_uri(iri):
5118 """
5119 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5120
5121 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5122 """
5123
5124 iri_parts = urllib.parse.urlparse(iri)
5125
5126 if '[' in iri_parts.netloc:
5127 raise ValueError('IPv6 URIs are not yet supported.')
5128 # Querying `.netloc` also raises a ValueError when there is only one bracket.
5129
5130 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5131
5132 net_location = ''
5133 if iri_parts.username:
5134 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5135 if iri_parts.password is not None:
5136 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5137 net_location += '@'
5138
5139 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5140 # The 'idna' encoding produces ASCII text.
5141 if iri_parts.port is not None and iri_parts.port != 80:
5142 net_location += ':' + str(iri_parts.port)
5143
5144 return urllib.parse.urlunparse(
5145 (iri_parts.scheme,
5146 net_location,
5147
5148 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5149
5150 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5151 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5152
5153 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5154 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5155
5156 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5157
5158 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5159
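# Illustrative example (hypothetical IRI):
#   >>> iri_to_uri('http://はじめよう.みんな/?query=サ')
#   'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?query=%E3%82%B5'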
5160
5161 def to_high_limit_path(path):
5162 if sys.platform in ['win32', 'cygwin']:
5163 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5164 return '\\\\?\\' + os.path.abspath(path)
5165
5166 return path
5167
5168
5169 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5170 val = traverse_obj(obj, *variadic(field))
5171 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5172 return default
5173 return template % func(val)
5174
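# Illustrative examples (hypothetical fields):
#   >>> format_field({'width': 1080}, 'width', '%dpx')
#   '1080px'
#   >>> format_field({}, 'width', '%dpx', default='unknown')
#   'unknown'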
5175
5176 def clean_podcast_url(url):
5177 return re.sub(r'''(?x)
5178 (?:
5179 (?:
5180 chtbl\.com/track|
5181 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5182 play\.podtrac\.com
5183 )/[^/]+|
5184 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5185 flex\.acast\.com|
5186 pd(?:
5187 cn\.co| # https://podcorn.com/analytics-prefix/
5188 st\.fm # https://podsights.com/docs/
5189 )/e
5190 )/''', '', url)
5191
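# Illustrative example (hypothetical tracking prefix):
#   >>> clean_podcast_url('https://chtbl.com/track/12345/traffic.megaphone.fm/ep.mp3')
#   'https://traffic.megaphone.fm/ep.mp3'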
5192
5193 _HEX_TABLE = '0123456789abcdef'
5194
5195
5196 def random_uuidv4():
5197 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5198
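# Note: this produces version-4-shaped IDs, but the variant nibble ('y') is
# drawn from all 16 hex digits rather than the [89ab] required by RFC 4122.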
5199
5200 def make_dir(path, to_screen=None):
5201 try:
5202 dn = os.path.dirname(path)
5203 if dn and not os.path.exists(dn):
5204 os.makedirs(dn)
5205 return True
5206 except OSError as err:
5207 if callable(to_screen): # not `callable(...) is not None`: callable() returns a bool, which is never None
5208 to_screen('unable to create directory ' + error_to_compat_str(err))
5209 return False
5210
5211
5212 def get_executable_path():
5213 from .update import _get_variant_and_executable_path
5214
5215 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5216
5217
5218 def load_plugins(name, suffix, namespace):
5219 classes = {}
5220 with contextlib.suppress(FileNotFoundError):
5221 plugins_spec = importlib.util.spec_from_file_location(
5222 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5223 plugins = importlib.util.module_from_spec(plugins_spec)
5224 sys.modules[plugins_spec.name] = plugins
5225 plugins_spec.loader.exec_module(plugins)
5226 for name in dir(plugins):
5227 if name in namespace:
5228 continue
5229 if not name.endswith(suffix):
5230 continue
5231 klass = getattr(plugins, name)
5232 classes[name] = namespace[name] = klass
5233 return classes
5234
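# Illustrative usage (hypothetical call site): extractor plugins would be
# loaded from ytdlp_plugins/extractor/__init__.py into the calling namespace:
#   plugin_ies = load_plugins('extractor', 'IE', globals())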
5235
5236 def traverse_obj(
5237 obj, *path_list, default=None, expected_type=None, get_all=True,
5238 casesense=True, is_user_input=False, traverse_string=False):
5239 ''' Traverse nested list/dict/tuple
5240 @param path_list A list of paths which are checked one by one.
5241 Each path is a list of keys where each key is a:
5242 - None: Do nothing
5243 - string: A dictionary key
5244 - int: An index into a list
5245 - tuple: A list of keys all of which will be traversed
5246 - Ellipsis: Fetch all values in the object
5247 - Function: Takes the key and value as arguments
5248 and returns whether the key matches or not
5249 @param default Default value to return
5250 @param expected_type Only accept final value of this type (Can also be any callable)
5251 @param get_all Return all the values obtained from a path or only the first one
5252 @param casesense Whether to consider dictionary keys as case sensitive
5253 @param is_user_input Whether the keys are generated from user input. If True,
5254 strings are converted to int/slice if necessary
5255 @param traverse_string Whether to traverse inside strings. If True, any
5256 non-compatible object will also be converted into a string
5257 # TODO: Write tests
5258 '''
5259 if not casesense:
5260 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5261 path_list = (map(_lower, variadic(path)) for path in path_list)
5262
5263 def _traverse_obj(obj, path, _current_depth=0):
5264 nonlocal depth
5265 path = tuple(variadic(path))
5266 for i, key in enumerate(path):
5267 if None in (key, obj):
5268 return obj
5269 if isinstance(key, (list, tuple)):
5270 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5271 key = ...
5272 if key is ...:
5273 obj = (obj.values() if isinstance(obj, dict)
5274 else obj if isinstance(obj, (list, tuple, LazyList))
5275 else str(obj) if traverse_string else [])
5276 _current_depth += 1
5277 depth = max(depth, _current_depth)
5278 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5279 elif callable(key):
5280 if isinstance(obj, (list, tuple, LazyList)):
5281 obj = enumerate(obj)
5282 elif isinstance(obj, dict):
5283 obj = obj.items()
5284 else:
5285 if not traverse_string:
5286 return None
5287 obj = str(obj)
5288 _current_depth += 1
5289 depth = max(depth, _current_depth)
5290 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5291 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5292 obj = (obj.get(key) if casesense or (key in obj)
5293 else next((v for k, v in obj.items() if _lower(k) == key), None))
5294 else:
5295 if is_user_input:
5296 key = (int_or_none(key) if ':' not in key
5297 else slice(*map(int_or_none, key.split(':'))))
5298 if key == slice(None):
5299 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5300 if not isinstance(key, (int, slice)):
5301 return None
5302 if not isinstance(obj, (list, tuple, LazyList)):
5303 if not traverse_string:
5304 return None
5305 obj = str(obj)
5306 try:
5307 obj = obj[key]
5308 except IndexError:
5309 return None
5310 return obj
5311
5312 if isinstance(expected_type, type):
5313 type_test = lambda val: val if isinstance(val, expected_type) else None
5314 else:
5315 type_test = expected_type or IDENTITY
5316
5317 for path in path_list:
5318 depth = 0
5319 val = _traverse_obj(obj, path)
5320 if val is not None:
5321 if depth:
5322 for _ in range(depth - 1):
5323 val = itertools.chain.from_iterable(v for v in val if v is not None)
5324 val = [v for v in map(type_test, val) if v is not None]
5325 if val:
5326 return val if get_all else val[0]
5327 else:
5328 val = type_test(val)
5329 if val is not None:
5330 return val
5331 return default
5332
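# Illustrative examples (hypothetical data):
#   >>> d = {'formats': [{'url': 'u1'}, {'url': 'u2'}]}
#   >>> traverse_obj(d, ('formats', ..., 'url'))
#   ['u1', 'u2']
#   >>> traverse_obj(d, ('formats', ..., 'url'), get_all=False)
#   'u1'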
5333
5334 def traverse_dict(dictn, keys, casesense=True):
5335 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5336 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5337 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5338
5339
5340 def get_first(obj, keys, **kwargs):
5341 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5342
5343
5344 def variadic(x, allowed_types=(str, bytes, dict)):
5345 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5346
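# Illustrative examples: strings, bytes and dicts are treated as atoms:
#   >>> variadic('spam')
#   ('spam',)
#   >>> variadic(['spam', 'eggs'])
#   ['spam', 'eggs']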
5347
5348 def time_seconds(**kwargs):
5349 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5350 return t.timestamp()
5351
5352
5353 # Create a JSON Web Signature (JWS) with the HS256 algorithm;
5354 # the resulting format is JWS Compact Serialization.
5355 # Implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5356 # and JWS https://www.rfc-editor.org/rfc/rfc7515.html
5357 def jwt_encode_hs256(payload_data, key, headers={}):
5358 header_data = {
5359 'alg': 'HS256',
5360 'typ': 'JWT',
5361 }
5362 if headers:
5363 header_data.update(headers)
5364 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5365 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5366 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5367 signature_b64 = base64.b64encode(h.digest())
5368 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5369 return token
5370
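# Illustrative usage (hypothetical payload/key):
#   token = jwt_encode_hs256({'id': 1}, 'secret')  # b'<header>.<payload>.<signature>'
# Note: this helper uses standard Base64 ('+', '/', with padding), whereas
# RFC 7515 prescribes unpadded base64url encoding.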
5371
5372 # Can be extended in the future to verify the signature and parse the header, returning the algorithm used if it is not HS256
5373 def jwt_decode_hs256(jwt):
5374 header_b64, payload_b64, signature_b64 = jwt.split('.')
5375 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5376 return payload_data
5377
5378
5379 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5380
5381
5382 @functools.cache
5383 def supports_terminal_sequences(stream):
5384 if compat_os_name == 'nt':
5385 if not WINDOWS_VT_MODE:
5386 return False
5387 elif not os.getenv('TERM'):
5388 return False
5389 try:
5390 return stream.isatty()
5391 except BaseException:
5392 return False
5393
5394
5395 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5396 if get_windows_version() < (10, 0, 10586):
5397 return
5398 global WINDOWS_VT_MODE
5399 try:
5400 Popen.run('', shell=True)
5401 except Exception:
5402 return
5403
5404 WINDOWS_VT_MODE = True
5405 supports_terminal_sequences.cache_clear()
5406
5407
5408 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5409
5410
5411 def remove_terminal_sequences(string):
5412 return _terminal_sequences_re.sub('', string)
5413
5414
5415 def number_of_digits(number):
5416 return len('%d' % number)
5417
5418
5419 def join_nonempty(*values, delim='-', from_dict=None):
5420 if from_dict is not None:
5421 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5422 return delim.join(map(str, filter(None, values)))
5423
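# Illustrative examples: falsy values (including 0) are dropped:
#   >>> join_nonempty('720p', None, '', 'dash')
#   '720p-dash'
#   >>> join_nonempty(0, 1, 2)
#   '1-2'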
5424
5425 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5426 """
5427 Find the largest format dimensions in terms of video width and, for each thumbnail:
5428 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5429 * Update dimensions
5430
5431 This function is useful with video services that scale the provided thumbnails on demand
5432 """
5433 _keys = ('width', 'height')
5434 max_dimensions = max(
5435 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5436 default=(0, 0))
5437 if not max_dimensions[0]:
5438 return thumbnails
5439 return [
5440 merge_dicts(
5441 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5442 dict(zip(_keys, max_dimensions)), thumbnail)
5443 for thumbnail in thumbnails
5444 ]
5445

def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    if not range:
        return None, None, None
    crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not crg:
        return None, None, None
    return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))

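# For example:
#   parse_http_range('bytes 0-499/1234') == (0, 499, 1234)
#   parse_http_range('bytes=500-') == (500, None, None)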

def read_stdin(what):
    eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
    return sys.stdin


def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """

    # BOMs are given priority over coding declarations
    for bom, enc in BOMS:
        if data.startswith(bom):
            return enc, len(bom)

    # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
    # We ignore the endianness to get a good enough match
    data = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
    return mobj.group(1).decode() if mobj else None, 0

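# For example, a config file that begins with a PEP 263-style coding declaration:
#   determine_file_encoding(b'# coding: utf-8\n--no-mtime') == ('utf-8', 0)
# while a file starting with a BOM from the module-level BOMS table yields that
# encoding along with the BOM's length as the number of bytes to skip.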

class Config:
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts
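
    # For example:
    #   Config.hide_login_info(['-u', 'me', '--password=secret'])
    #   returns ['-u', 'PRIVATE', '--password=PRIVATE']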

    def append_config(self, *args, label=None):
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)


class WebSocketsWrapper:
    """Wraps the websockets module to use in non-async scopes"""
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Cancel leftover tasks *before* closing the loop; a closed loop can no longer run_until_complete() the cancellations
            self._cancel_all_tasks(self.loop)
            self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # For contributors: if any new library that uses asyncio needs to be run in non-async scopes, move these functions out of this class
    @staticmethod
    def run_with_loop(main, loop):
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })

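# A minimal usage sketch of the wrapper above (hypothetical URL; requires the optional
# websockets dependency and a reachable server, hence shown only as comments):
#
#   ws = WebSocketsWrapper('wss://example.invalid/socket', headers={'Origin': 'https://example.invalid'})
#   ws.send('hello')
#   reply = ws.recv()
#   ws.__exit__(None, None, None)  # also registered via atexit by __init__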

def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}

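# For example:
#   merge_headers({'accept': '*/*', 'x-a': '1'}, {'Accept': 'text/html'})
#   == {'Accept': 'text/html', 'X-A': '1'}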

def cached_method(f):
    """Cache a method"""
    signature = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        bound_args = signature.bind(self, *args, **kwargs)
        bound_args.apply_defaults()
        key = tuple(bound_args.arguments.values())

        if not hasattr(self, '__cached_method__cache'):
            self.__cached_method__cache = {}
        cache = self.__cached_method__cache.setdefault(f.__name__, {})
        if key not in cache:
            cache[key] = f(self, *args, **kwargs)
        return cache[key]
    return wrapper

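# A small sketch of the decorator above (hypothetical class):
#
#   class Fetcher:
#       @cached_method
#       def fetch(self, url):
#           print('fetching', url)  # runs only once per distinct url
#           return url.upper()
#
# Results are memoized per instance in self.__cached_method__cache, keyed by the bound argument values.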

class classproperty:
    """property access for class methods"""

    def __init__(self, func):
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, _, cls):
        return self.func(cls)

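# For example (hypothetical class):
#
#   class Foo:
#       @classproperty
#       def name(cls):
#           return cls.__name__
#
#   Foo.name == 'Foo'    # works on the class itself as well as on instances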

class Namespace(types.SimpleNamespace):
    """Namespace that iterates over its attribute values"""

    def __iter__(self):
        return iter(self.__dict__.values())

    @property
    def items_(self):
        return self.__dict__.items()


MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)


class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        self.retries = _retries or 0
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT
            self.attempt += 1
            yield self
            if self.error:
                self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            e = remove_end(str(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)

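# A minimal sketch wiring the manager above to report_retry (hypothetical fallible
# operation; plain callables stand in for a real logger):
#
#   for retry in RetryManager(3, RetryManager.report_retry,
#                             sleep_func=1, info=print, warn=print):
#       try:
#           do_request()  # hypothetical operation that may raise
#       except OSError as err:
#           retry.error = err
#           continue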

def make_archive_id(ie, video_id):
    ie_key = ie if isinstance(ie, str) else ie.ie_key()
    return f'{ie_key.lower()} {video_id}'


def truncate_string(s, left, right=0):
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    # s[-0:] would yield the whole string, so only keep a tail when right > 0
    return f'{s[:left - 3]}...{s[-right:] if right else ""}'

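# For example:
#   truncate_string('abcdefghijkl', 7) == 'abcd...'
#   truncate_string('abcdefghijkl', 7, right=2) == 'abcd...kl'
#   truncate_string('short', 7) == 'short'    # short enough; returned unchanged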

# Deprecated
has_certifi = bool(certifi)
has_websockets = bool(websockets)