import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import collections.abc
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
    # these follow the genitive grammatical case (dopełniacz)
    # some websites might be using nominative, which will require another month list
    # https://en.wikibooks.org/wiki/Polish/Noun_cases
    'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
           'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

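# Example (illustrative; the prefix and URL are hypothetical):
#   xpath_with_ns('media:song/media:author', {'media': 'http://example.com/ns'})
#   == '{http://example.com/ns}song/{http://example.com/ns}author'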

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]

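# Illustrative usage of the xpath_* helpers above:
#   doc = xml.etree.ElementTree.fromstring('<root><a x="1">hello</a></root>')
#   xpath_text(doc, './a')       == 'hello'
#   xpath_attr(doc, './a', 'x')  == '1'
#   xpath_element(doc, './b', default=None) is None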

def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]

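# Example (illustrative): the class may appear anywhere in the class attribute:
#   get_element_by_class('foo', '<div class="foo bar">text</div>')       == 'text'
#   get_element_html_by_class('foo', '<div class="foo bar">text</div>')  == '<div class="foo bar">text</div>'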

def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """
    if not value:
        return

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


# XXX: This should be far less strict
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')

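# Example (illustrative): nested tags of the same name are matched correctly:
#   get_element_text_and_html_by_tag('span', '<p><span>a<span>b</span></span></p>')
#   == ('a<span>b</span>', '<span>a<span>b</span></span>')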

class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
        raise compat_HTMLParseError('done')


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

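# Example (illustrative): <br> becomes a newline, tags and entities are removed:
#   clean_html('<p>Hello<br/>World</p>')  == 'Hello\nWorld'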

class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        try:
            if self.ignore_extra:
                return self.raw_decode(s.lstrip())[0]
            return super().decode(s)
        except json.JSONDecodeError as e:
            if e.pos is not None:
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
            raise

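# Example (illustrative): tolerate trailing garbage after the JSON value:
#   json.loads('{"a": 1}</script>', cls=LenientJSONDecoder, ignore_extra=True)  == {'a': 1}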

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp

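# Example (illustrative):
#   timeconvert('Wed, 14 Dec 2022 10:00:00 +0000')  == 1671012000
#   timeconvert('not a date') is None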

def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    # Replace look-alike Unicode glyphs
    if restricted and (is_id is NO_DEFAULT or not is_id):
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

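# Examples (illustrative):
#   sanitize_filename('AT&T', restricted=True)  == 'AT_T'
#   sanitize_filename('ä', restricted=True)     == 'a'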

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

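# Examples (illustrative):
#   sanitize_url('//example.com/video')         == 'http://example.com/video'
#   sanitize_url('httpss://example.com/video')  == 'https://example.com/video'
#   sanitize_url('rmtp://example.com/live')     == 'rtmp://example.com/live'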

def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'

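# Example (illustrative): credentials are stripped from the URL and returned
# as a ready-to-use Authorization header value:
#   extract_basic_auth('http://user:pass@example.com/')
#   == ('http://example.com/', 'Basic dXNlcjpwYXNz')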

def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())

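# Example (illustrative): first occurrence wins, order is preserved:
#   orderedSet([1, 2, 1, 3, 2])  == [1, 2, 3]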

def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

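# Example (illustrative): named, hexadecimal and decimal entities all decode:
#   unescapeHTML('&amp; &#x26; &#38;')  == '& & &'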

def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            default = '' if proc.text_mode else b''
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or default, stderr or default, proc.returncode

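# Illustrative usage of Popen.run (the command shown is only an example):
#   stdout, stderr, retcode = Popen.run(
#       ['ffmpeg', '-version'], text=True,
#       stdout=subprocess.PIPE, stderr=subprocess.PIPE)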

def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

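# Examples (illustrative):
#   timetuple_from_msec(123456)       == Time(hours=0, minutes=2, seconds=3, milliseconds=456)
#   formatSeconds(3661.5, msec=True)  == '1:01:01.500'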

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    elif (
        sys.version_info < (3, 10)
        and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1)
        and not ssl.OPENSSL_VERSION.startswith('LibreSSL')
    ):
        # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
        # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting
        # in some situations [2][3].
        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
        # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536
        # 2. https://github.com/yt-dlp/yt-dlp/issues/4627
        # 3. https://github.com/yt-dlp/yt-dlp/pull/5294
        # 4. https://peps.python.org/pep-0644/
        # 5. https://peps.python.org/pep-0644/#libressl-support
        # 6. https://github.com/yt-dlp/yt-dlp/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368
        context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
        context.minimum_version = ssl.TLSVersion.TLSv1_2

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info
        super().__init__(self.__msg)

    @property
    def __msg(self):
        return ''.join((
            format_field(self.ie, None, '[%s] '),
            format_field(self.video_id, None, '%s: '),
            self.orig_msg,
            format_field(self.cause, None, ' (caused by %r)'),
            '' if self.expected else bug_reports_message()))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None

    def __setattr__(self, name, value):
        super().__setattr__(name, value)
        if getattr(self, 'msg', None) and name not in ('msg', 'args'):
            self.msg = self.__msg or type(self).__name__
            self.args = (self.msg, )  # Cannot be property


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers

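# Example (illustrative): the marker header removes Accept-Encoding and itself:
#   handle_youtubedl_headers({'Youtubedl-no-compression': 'True', 'Accept-Encoding': 'gzip'})  == {}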

class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))


class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        if is_path_like(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        return 'TRUE' if cndn else 'FALSE'

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        if is_path_like(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True

1678
1679 class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
1680 def __init__(self, cookiejar=None):
1681 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
1682
1683 def http_response(self, request, response):
1684 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
1685
1686 https_request = urllib.request.HTTPCookieProcessor.http_request
1687 https_response = http_response
1688
1689
1690 class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
1691 """YoutubeDL redirect handler
1692
1693 The code is based on HTTPRedirectHandler implementation from CPython [1].
1694
1695 This redirect handler solves two issues:
1696 - ensures redirect URL is always unicode under python 2
1697 - introduces support for the HTTP response status code
1698 308 Permanent Redirect [2] (since standardized in RFC 7538) used by some sites [3]
1699
1700 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1701 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1702 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1703 """
1704
1705 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
1706
1707 def redirect_request(self, req, fp, code, msg, headers, newurl):
1708 """Return a Request or None in response to a redirect.
1709
1710 This is called by the http_error_30x methods when a
1711 redirection response is received. If a redirection should
1712 take place, return a new Request to allow http_error_30x to
1713 perform the redirect. Otherwise, raise HTTPError if no-one
1714 else should try to handle this url. Return None if you can't
1715 but another Handler might.
1716 """
1717 m = req.get_method()
1718 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1719 or code in (301, 302, 303) and m == "POST")):
1720 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
1721 # Strictly (according to RFC 2616), 301 or 302 in response to
1722 # a POST MUST NOT cause a redirection without confirmation
1723 # from the user (of urllib.request, in this case). In practice,
1724 # essentially all clients do redirect in this case, so we do
1725 # the same.
1726
1727 # Be conciliant with URIs containing a space. This is mainly
1728 # redundant with the more complete encoding done in http_error_302(),
1729 # but it is kept for compatibility with other callers.
1730 newurl = newurl.replace(' ', '%20')
1731
1732 CONTENT_HEADERS = ("content-length", "content-type")
1733 # Strip the entity-body headers so they are not carried over to the redirected request
1734 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
1735
1736 # A 303 must either use GET or HEAD for subsequent request
1737 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1738 if code == 303 and m != 'HEAD':
1739 m = 'GET'
1740 # 301 and 302 redirects are commonly turned into a GET from a POST
1741 # for subsequent requests by browsers, so we'll do the same.
1742 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1743 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1744 if code in (301, 302) and m == 'POST':
1745 m = 'GET'
1746
1747 return urllib.request.Request(
1748 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1749 unverifiable=True, method=m)
1750
1751
1752 def extract_timezone(date_str):
1753 m = re.search(
1754 r'''(?x)
1755 ^.{8,}? # >=8 char non-TZ prefix, if present
1756 (?P<tz>Z| # just the UTC Z, or
1757 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1758 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1759 [ ]? # optional space
1760 (?P<sign>\+|-) # +/-
1761 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1762 $)
1763 ''', date_str)
1764 if not m:
1765 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1766 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1767 if timezone is not None:
1768 date_str = date_str[:-len(m.group('tz'))]
1769 timezone = datetime.timedelta(hours=timezone or 0)
1770 else:
1771 date_str = date_str[:-len(m.group('tz'))]
1772 if not m.group('sign'):
1773 timezone = datetime.timedelta()
1774 else:
1775 sign = 1 if m.group('sign') == '+' else -1
1776 timezone = datetime.timedelta(
1777 hours=sign * int(m.group('hours')),
1778 minutes=sign * int(m.group('minutes')))
1779 return timezone, date_str
1780
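# Illustrative behaviour of the extraction above (an informal sketch derived
# from the regexes, not executed at import time):
#   extract_timezone('2023-01-01T12:00:00+05:30')
#       == (datetime.timedelta(hours=5, minutes=30), '2023-01-01T12:00:00')
#   extract_timezone('2023-01-01T12:00:00Z')
#       == (datetime.timedelta(0), '2023-01-01T12:00:00')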
1781
1782 def parse_iso8601(date_str, delimiter='T', timezone=None):
1783 """ Return a UNIX timestamp from the given date """
1784
1785 if date_str is None:
1786 return None
1787
1788 date_str = re.sub(r'\.[0-9]+', '', date_str)
1789
1790 if timezone is None:
1791 timezone, date_str = extract_timezone(date_str)
1792
1793 with contextlib.suppress(ValueError):
1794 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
1795 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1796 return calendar.timegm(dt.timetuple())
1797
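# e.g. (informal sketch, mirroring the test suite):
#   parse_iso8601('2014-03-23T23:04:26+0100') == 1395612266
#   parse_iso8601(None) is None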
1798
1799 def date_formats(day_first=True):
1800 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1801
1802
1803 def unified_strdate(date_str, day_first=True):
1804 """Return a string with the date in the format YYYYMMDD"""
1805
1806 if date_str is None:
1807 return None
1808 upload_date = None
1809 # Replace commas
1810 date_str = date_str.replace(',', ' ')
1811 # Remove AM/PM + timezone
1812 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1813 _, date_str = extract_timezone(date_str)
1814
1815 for expression in date_formats(day_first):
1816 with contextlib.suppress(ValueError):
1817 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1818 if upload_date is None:
1819 timetuple = email.utils.parsedate_tz(date_str)
1820 if timetuple:
1821 with contextlib.suppress(ValueError):
1822 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1823 if upload_date is not None:
1824 return str(upload_date)
1825
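# Illustrative usage (informal sketch; assumes DATE_FORMATS covers these shapes):
#   unified_strdate('December 21, 2010') == '20101221'
#   unified_strdate('1968-12-10') == '19681210'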
1826
1827 def unified_timestamp(date_str, day_first=True):
1828 if date_str is None:
1829 return None
1830
1831 date_str = re.sub(r'\s+', ' ', re.sub(
1832 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
1833
1834 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1835 timezone, date_str = extract_timezone(date_str)
1836
1837 # Remove AM/PM + timezone
1838 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1839
1840 # Remove unrecognized timezones from ISO 8601 alike timestamps
1841 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1842 if m:
1843 date_str = date_str[:-len(m.group('tz'))]
1844
1845 # Python only supports microseconds, so remove nanoseconds
1846 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1847 if m:
1848 date_str = m.group(1)
1849
1850 for expression in date_formats(day_first):
1851 with contextlib.suppress(ValueError):
1852 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1853 return calendar.timegm(dt.timetuple())
1854
1855 timetuple = email.utils.parsedate_tz(date_str)
1856 if timetuple:
1857 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
1858
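# Illustrative usage (informal sketch): the timezone offset is folded into
# the returned UNIX timestamp, like parse_iso8601 above:
#   unified_timestamp('2014-03-23T23:04:26+0100') == 1395612266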
1859
1860 def determine_ext(url, default_ext='unknown_video'):
1861 if url is None or '.' not in url:
1862 return default_ext
1863 guess = url.partition('?')[0].rpartition('.')[2]
1864 if re.match(r'^[A-Za-z0-9]+$', guess):
1865 return guess
1866 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1867 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1868 return guess.rstrip('/')
1869 else:
1870 return default_ext
1871
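# Illustrative usage (informal sketch):
#   determine_ext('http://example.com/foo/bar.mp4/?download') == 'mp4'
#   determine_ext('http://example.com/foo/bar.nonext/?download') == 'unknown_video'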
1872
1873 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1874 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1875
1876
1877 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1878 R"""
1879 Return a datetime object from a string.
1880 Supported format:
1881 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1882
1883 @param format strftime format of DATE
1884 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1885 auto: round to the unit provided in date_str (if applicable).
1886 """
1887 auto_precision = False
1888 if precision == 'auto':
1889 auto_precision = True
1890 precision = 'microsecond'
1891 today = datetime_round(datetime.datetime.utcnow(), precision)
1892 if date_str in ('now', 'today'):
1893 return today
1894 if date_str == 'yesterday':
1895 return today - datetime.timedelta(days=1)
1896 match = re.match(
1897 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
1898 date_str)
1899 if match is not None:
1900 start_time = datetime_from_str(match.group('start'), precision, format)
1901 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1902 unit = match.group('unit')
1903 if unit == 'month' or unit == 'year':
1904 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1905 unit = 'day'
1906 else:
1907 if unit == 'week':
1908 unit = 'day'
1909 time *= 7
1910 delta = datetime.timedelta(**{unit + 's': time})
1911 new_date = start_time + delta
1912 if auto_precision:
1913 return datetime_round(new_date, unit)
1914 return new_date
1915
1916 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1917
1918
1919 def date_from_str(date_str, format='%Y%m%d', strict=False):
1920 R"""
1921 Return a date object from a string using datetime_from_str
1922
1923 @param strict Restrict allowed patterns to "YYYYMMDD" and
1924 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
1925 """
1926 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1927 raise ValueError(f'Invalid date format "{date_str}"')
1928 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1929
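# Illustrative usage (informal sketch; the result depends on the current date):
#   date_from_str('now-1week')  # -> the date one week before today (UTC)
#   date_from_str('now+1day', strict=True)  # raises ValueError ('+' is not allowed in strict mode)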
1930
1931 def datetime_add_months(dt, months):
1932 """Increment/Decrement a datetime object by months."""
1933 month = dt.month + months - 1
1934 year = dt.year + month // 12
1935 month = month % 12 + 1
1936 day = min(dt.day, calendar.monthrange(year, month)[1])
1937 return dt.replace(year, month, day)
1938
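# e.g. the day is clamped to the length of the target month (informal sketch):
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1) == datetime.datetime(2020, 2, 29)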
1939
1940 def datetime_round(dt, precision='day'):
1941 """
1942 Round a datetime object's time to a specific precision
1943 """
1944 if precision == 'microsecond':
1945 return dt
1946
1947 unit_seconds = {
1948 'day': 86400,
1949 'hour': 3600,
1950 'minute': 60,
1951 'second': 1,
1952 }
1953 roundto = lambda x, n: ((x + n / 2) // n) * n
1954 timestamp = calendar.timegm(dt.timetuple())
1955 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1956
1957
1958 def hyphenate_date(date_str):
1959 """
1960 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1961 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1962 if match is not None:
1963 return '-'.join(match.groups())
1964 else:
1965 return date_str
1966
1967
1968 class DateRange:
1969 """Represents a time interval between two dates"""
1970
1971 def __init__(self, start=None, end=None):
1972 """start and end must be strings in the format accepted by date"""
1973 if start is not None:
1974 self.start = date_from_str(start, strict=True)
1975 else:
1976 self.start = datetime.datetime.min.date()
1977 if end is not None:
1978 self.end = date_from_str(end, strict=True)
1979 else:
1980 self.end = datetime.datetime.max.date()
1981 if self.start > self.end:
1982 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
1983
1984 @classmethod
1985 def day(cls, day):
1986 """Returns a range that only contains the given day"""
1987 return cls(day, day)
1988
1989 def __contains__(self, date):
1990 """Check if the date is in the range"""
1991 if not isinstance(date, datetime.date):
1992 date = date_from_str(date)
1993 return self.start <= date <= self.end
1994
1995 def __str__(self):
1996 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1997
1998 def __eq__(self, other):
1999 return (isinstance(other, DateRange)
2000 and self.start == other.start and self.end == other.end)
2001
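# Illustrative usage (informal sketch; strings are parsed with date_from_str):
#   '20221215' in DateRange('20221201', '20221231')  # -> True
#   '20230101' in DateRange.day('20221215')          # -> False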
2002
2003 def platform_name():
2004 """ Returns the platform name as a str """
2005 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
2006 return platform.platform()
2007
2008
2009 @functools.cache
2010 def system_identifier():
2011 python_implementation = platform.python_implementation()
2012 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
2013 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
2014 libc_ver = []
2015 with contextlib.suppress(OSError): # We may not have access to the executable
2016 libc_ver = platform.libc_ver()
2017
2018 return 'Python %s (%s %s %s) - %s (%s%s)' % (
2019 platform.python_version(),
2020 python_implementation,
2021 platform.machine(),
2022 platform.architecture()[0],
2023 platform.platform(),
2024 ssl.OPENSSL_VERSION,
2025 format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'),
2026 )
2027
2028
2029 @functools.cache
2030 def get_windows_version():
2031 ''' Get Windows version. Returns () if not running on Windows '''
2032 if compat_os_name == 'nt':
2033 return version_tuple(platform.win32_ver()[1])
2034 else:
2035 return ()
2036
2037
2038 def write_string(s, out=None, encoding=None):
2039 assert isinstance(s, str)
2040 out = out or sys.stderr
2041
2042 if compat_os_name == 'nt' and supports_terminal_sequences(out):
2043 s = re.sub(r'([\r\n]+)', r' \1', s)
2044
2045 enc, buffer = None, out
2046 if 'b' in getattr(out, 'mode', ''):
2047 enc = encoding or preferredencoding()
2048 elif hasattr(out, 'buffer'):
2049 buffer = out.buffer
2050 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2051
2052 buffer.write(s.encode(enc, 'ignore') if enc else s)
2053 out.flush()
2054
2055
2056 def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2057 from . import _IN_CLI
2058 if _IN_CLI:
2059 if msg in deprecation_warning._cache:
2060 return
2061 deprecation_warning._cache.add(msg)
2062 if printer:
2063 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2064 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2065 else:
2066 import warnings
2067 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2068
2069
2070 deprecation_warning._cache = set()
2071
2072
2073 def bytes_to_intlist(bs):
2074 if not bs:
2075 return []
2076 if isinstance(bs[0], int): # Python 3
2077 return list(bs)
2078 else:
2079 return [ord(c) for c in bs]
2080
2081
2082 def intlist_to_bytes(xs):
2083 if not xs:
2084 return b''
2085 return struct.pack('%dB' % len(xs), *xs)
2086
2087
2088 class LockingUnsupportedError(OSError):
2089 msg = 'File locking is not supported'
2090
2091 def __init__(self):
2092 super().__init__(self.msg)
2093
2094
2095 # Cross-platform file locking
2096 if sys.platform == 'win32':
2097 import ctypes
2098 import ctypes.wintypes
2099 import msvcrt
2100
2101 class OVERLAPPED(ctypes.Structure):
2102 _fields_ = [
2103 ('Internal', ctypes.wintypes.LPVOID),
2104 ('InternalHigh', ctypes.wintypes.LPVOID),
2105 ('Offset', ctypes.wintypes.DWORD),
2106 ('OffsetHigh', ctypes.wintypes.DWORD),
2107 ('hEvent', ctypes.wintypes.HANDLE),
2108 ]
2109
2110 kernel32 = ctypes.windll.kernel32
2111 LockFileEx = kernel32.LockFileEx
2112 LockFileEx.argtypes = [
2113 ctypes.wintypes.HANDLE, # hFile
2114 ctypes.wintypes.DWORD, # dwFlags
2115 ctypes.wintypes.DWORD, # dwReserved
2116 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2117 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2118 ctypes.POINTER(OVERLAPPED) # Overlapped
2119 ]
2120 LockFileEx.restype = ctypes.wintypes.BOOL
2121 UnlockFileEx = kernel32.UnlockFileEx
2122 UnlockFileEx.argtypes = [
2123 ctypes.wintypes.HANDLE, # hFile
2124 ctypes.wintypes.DWORD, # dwReserved
2125 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2126 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2127 ctypes.POINTER(OVERLAPPED) # Overlapped
2128 ]
2129 UnlockFileEx.restype = ctypes.wintypes.BOOL
2130 whole_low = 0xffffffff
2131 whole_high = 0x7fffffff
2132
2133 def _lock_file(f, exclusive, block):
2134 overlapped = OVERLAPPED()
2135 overlapped.Offset = 0
2136 overlapped.OffsetHigh = 0
2137 overlapped.hEvent = 0
2138 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2139
2140 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2141 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2142 0, whole_low, whole_high, f._lock_file_overlapped_p):
2143 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy
2144 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
2145
2146 def _unlock_file(f):
2147 assert f._lock_file_overlapped_p
2148 handle = msvcrt.get_osfhandle(f.fileno())
2149 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2150 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2151
2152 else:
2153 try:
2154 import fcntl
2155
2156 def _lock_file(f, exclusive, block):
2157 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2158 if not block:
2159 flags |= fcntl.LOCK_NB
2160 try:
2161 fcntl.flock(f, flags)
2162 except BlockingIOError:
2163 raise
2164 except OSError: # AOSP does not have flock()
2165 fcntl.lockf(f, flags)
2166
2167 def _unlock_file(f):
2168 try:
2169 fcntl.flock(f, fcntl.LOCK_UN)
2170 except OSError:
2171 fcntl.lockf(f, fcntl.LOCK_UN)
2172
2173 except ImportError:
2174
2175 def _lock_file(f, exclusive, block):
2176 raise LockingUnsupportedError()
2177
2178 def _unlock_file(f):
2179 raise LockingUnsupportedError()
2180
2181
2182 class locked_file:
2183 locked = False
2184
2185 def __init__(self, filename, mode, block=True, encoding=None):
2186 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2187 raise NotImplementedError(mode)
2188 self.mode, self.block = mode, block
2189
2190 writable = any(f in mode for f in 'wax+')
2191 readable = any(f in mode for f in 'r+')
2192 flags = functools.reduce(operator.ior, (
2193 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2194 getattr(os, 'O_BINARY', 0), # Windows only
2195 getattr(os, 'O_NOINHERIT', 0), # Windows only
2196 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2197 os.O_APPEND if 'a' in mode else 0,
2198 os.O_EXCL if 'x' in mode else 0,
2199 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2200 ))
2201
2202 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2203
2204 def __enter__(self):
2205 exclusive = 'r' not in self.mode
2206 try:
2207 _lock_file(self.f, exclusive, self.block)
2208 self.locked = True
2209 except OSError:
2210 self.f.close()
2211 raise
2212 if 'w' in self.mode:
2213 try:
2214 self.f.truncate()
2215 except OSError as e:
2216 if e.errno not in (
2217 errno.ESPIPE, # Illegal seek - expected for FIFO
2218 errno.EINVAL, # Invalid argument - expected for /dev/null
2219 ):
2220 raise
2221 return self
2222
2223 def unlock(self):
2224 if not self.locked:
2225 return
2226 try:
2227 _unlock_file(self.f)
2228 finally:
2229 self.locked = False
2230
2231 def __exit__(self, *_):
2232 try:
2233 self.unlock()
2234 finally:
2235 self.f.close()
2236
2237 open = __enter__
2238 close = __exit__
2239
2240 def __getattr__(self, attr):
2241 return getattr(self.f, attr)
2242
2243 def __iter__(self):
2244 return iter(self.f)
2245
2246
2247 @functools.cache
2248 def get_filesystem_encoding():
2249 encoding = sys.getfilesystemencoding()
2250 return encoding if encoding is not None else 'utf-8'
2251
2252
2253 def shell_quote(args):
2254 quoted_args = []
2255 encoding = get_filesystem_encoding()
2256 for a in args:
2257 if isinstance(a, bytes):
2258 # We may get a filename encoded with 'encodeFilename'
2259 a = a.decode(encoding)
2260 quoted_args.append(compat_shlex_quote(a))
2261 return ' '.join(quoted_args)
2262
2263
2264 def smuggle_url(url, data):
2265 """ Pass additional data in a URL for internal use. """
2266
2267 url, idata = unsmuggle_url(url, {})
2268 data.update(idata)
2269 sdata = urllib.parse.urlencode(
2270 {'__youtubedl_smuggle': json.dumps(data)})
2271 return url + '#' + sdata
2272
2273
2274 def unsmuggle_url(smug_url, default=None):
2275 if '#__youtubedl_smuggle' not in smug_url:
2276 return smug_url, default
2277 url, _, sdata = smug_url.rpartition('#')
2278 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
2279 data = json.loads(jsond)
2280 return url, data
2281
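# Round-trip sketch (informal; the payload travels in the URL fragment):
#   url = smuggle_url('https://example.com/video', {'referer': 'https://example.com'})
#   unsmuggle_url(url) == ('https://example.com/video', {'referer': 'https://example.com'})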
2282
2283 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2284 """ Formats numbers with decimal sufixes like K, M, etc """
2285 num, factor = float_or_none(num), float(factor)
2286 if num is None or num < 0:
2287 return None
2288 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2289 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2290 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2291 if factor == 1024:
2292 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2293 converted = num / (factor ** exponent)
2294 return fmt % (converted, suffix)
2295
2296
2297 def format_bytes(bytes):
2298 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2299
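# Illustrative usage (informal sketch):
#   format_decimal_suffix(1234000, '%.1f%s')  # -> '1.2M'
#   format_bytes(1048576)  # -> '1.00MiB'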
2300
2301 def lookup_unit_table(unit_table, s, strict=False):
2302 num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]')
2303 units_re = '|'.join(re.escape(u) for u in unit_table)
2304 m = (re.fullmatch if strict else re.match)(
2305 rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s)
2306 if not m:
2307 return None
2308
2309 num = float(m.group('num').replace(',', '.'))
2310 mult = unit_table[m.group('unit')]
2311 return round(num * mult)
2312
2313
2314 def parse_bytes(s):
2315 """Parse a string indicating a byte quantity into an integer"""
2316 return lookup_unit_table(
2317 {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])},
2318 s.upper(), strict=True)
2319
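# Illustrative usage (informal sketch; units are single letters, binary multiples):
#   parse_bytes('1M')    # -> 1048576
#   parse_bytes('500K')  # -> 512000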
2320
2321 def parse_filesize(s):
2322 if s is None:
2323 return None
2324
2325 # The lower-case forms are of course incorrect and unofficial,
2326 # but we support those too
2327 _UNIT_TABLE = {
2328 'B': 1,
2329 'b': 1,
2330 'bytes': 1,
2331 'KiB': 1024,
2332 'KB': 1000,
2333 'kB': 1024,
2334 'Kb': 1000,
2335 'kb': 1000,
2336 'kilobytes': 1000,
2337 'kibibytes': 1024,
2338 'MiB': 1024 ** 2,
2339 'MB': 1000 ** 2,
2340 'mB': 1024 ** 2,
2341 'Mb': 1000 ** 2,
2342 'mb': 1000 ** 2,
2343 'megabytes': 1000 ** 2,
2344 'mebibytes': 1024 ** 2,
2345 'GiB': 1024 ** 3,
2346 'GB': 1000 ** 3,
2347 'gB': 1024 ** 3,
2348 'Gb': 1000 ** 3,
2349 'gb': 1000 ** 3,
2350 'gigabytes': 1000 ** 3,
2351 'gibibytes': 1024 ** 3,
2352 'TiB': 1024 ** 4,
2353 'TB': 1000 ** 4,
2354 'tB': 1024 ** 4,
2355 'Tb': 1000 ** 4,
2356 'tb': 1000 ** 4,
2357 'terabytes': 1000 ** 4,
2358 'tebibytes': 1024 ** 4,
2359 'PiB': 1024 ** 5,
2360 'PB': 1000 ** 5,
2361 'pB': 1024 ** 5,
2362 'Pb': 1000 ** 5,
2363 'pb': 1000 ** 5,
2364 'petabytes': 1000 ** 5,
2365 'pebibytes': 1024 ** 5,
2366 'EiB': 1024 ** 6,
2367 'EB': 1000 ** 6,
2368 'eB': 1024 ** 6,
2369 'Eb': 1000 ** 6,
2370 'eb': 1000 ** 6,
2371 'exabytes': 1000 ** 6,
2372 'exbibytes': 1024 ** 6,
2373 'ZiB': 1024 ** 7,
2374 'ZB': 1000 ** 7,
2375 'zB': 1024 ** 7,
2376 'Zb': 1000 ** 7,
2377 'zb': 1000 ** 7,
2378 'zettabytes': 1000 ** 7,
2379 'zebibytes': 1024 ** 7,
2380 'YiB': 1024 ** 8,
2381 'YB': 1000 ** 8,
2382 'yB': 1024 ** 8,
2383 'Yb': 1000 ** 8,
2384 'yb': 1000 ** 8,
2385 'yottabytes': 1000 ** 8,
2386 'yobibytes': 1024 ** 8,
2387 }
2388
2389 return lookup_unit_table(_UNIT_TABLE, s)
2390
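# Illustrative usage (informal sketch; note the decimal vs binary unit split):
#   parse_filesize('1,24 KB')  # -> 1240 (KB is decimal)
#   parse_filesize('1.5GiB')   # -> 1610612736 (GiB is binary)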
2391
2392 def parse_count(s):
2393 if s is None:
2394 return None
2395
2396 s = re.sub(r'^[^\d]+\s', '', s).strip()
2397
2398 if re.match(r'^[\d,.]+$', s):
2399 return str_to_int(s)
2400
2401 _UNIT_TABLE = {
2402 'k': 1000,
2403 'K': 1000,
2404 'm': 1000 ** 2,
2405 'M': 1000 ** 2,
2406 'kk': 1000 ** 2,
2407 'KK': 1000 ** 2,
2408 'b': 1000 ** 3,
2409 'B': 1000 ** 3,
2410 }
2411
2412 ret = lookup_unit_table(_UNIT_TABLE, s)
2413 if ret is not None:
2414 return ret
2415
2416 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2417 if mobj:
2418 return str_to_int(mobj.group(1))
2419
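# Illustrative usage (informal sketch):
#   parse_count('1,100')      # -> 1100
#   parse_count('18M views')  # -> 18000000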
2420
2421 def parse_resolution(s, *, lenient=False):
2422 if s is None:
2423 return {}
2424
2425 if lenient:
2426 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2427 else:
2428 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2429 if mobj:
2430 return {
2431 'width': int(mobj.group('w')),
2432 'height': int(mobj.group('h')),
2433 }
2434
2435 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2436 if mobj:
2437 return {'height': int(mobj.group(1))}
2438
2439 mobj = re.search(r'\b([48])[kK]\b', s)
2440 if mobj:
2441 return {'height': int(mobj.group(1)) * 540}
2442
2443 return {}
2444
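# Illustrative usage (informal sketch):
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # -> {'height': 720}
#   parse_resolution('4k')         # -> {'height': 2160}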
2445
2446 def parse_bitrate(s):
2447 if not isinstance(s, str):
2448 return
2449 mobj = re.search(r'\b(\d+)\s*kbps', s)
2450 if mobj:
2451 return int(mobj.group(1))
2452
2453
2454 def month_by_name(name, lang='en'):
2455 """ Return the number of a month by (locale-independently) English name """
2456
2457 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2458
2459 try:
2460 return month_names.index(name) + 1
2461 except ValueError:
2462 return None
2463
2464
2465 def month_by_abbreviation(abbrev):
2466 """ Return the number of a month by (locale-independently) English
2467 abbreviations """
2468
2469 try:
2470 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2471 except ValueError:
2472 return None
2473
2474
2475 def fix_xml_ampersands(xml_str):
2476 """Replace all the '&' by '&amp;' in XML"""
2477 return re.sub(
2478 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2479 '&amp;',
2480 xml_str)
2481
2482
2483 def setproctitle(title):
2484 assert isinstance(title, str)
2485
2486 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2487 try:
2488 import ctypes
2489 except ImportError:
2490 return
2491
2492 try:
2493 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2494 except OSError:
2495 return
2496 except TypeError:
2497 # LoadLibrary in Windows Python 2.7.13 only expects
2498 # a bytestring, but since unicode_literals turns
2499 # every string into a unicode string, it fails.
2500 return
2501 title_bytes = title.encode()
2502 buf = ctypes.create_string_buffer(len(title_bytes))
2503 buf.value = title_bytes
2504 try:
2505 libc.prctl(15, buf, 0, 0, 0)
2506 except AttributeError:
2507 return # Strange libc, just skip this
2508
2509
2510 def remove_start(s, start):
2511 return s[len(start):] if s is not None and s.startswith(start) else s
2512
2513
2514 def remove_end(s, end):
2515 return s[:-len(end)] if s is not None and s.endswith(end) else s
2516
2517
2518 def remove_quotes(s):
2519 if s is None or len(s) < 2:
2520 return s
2521 for quote in ('"', "'", ):
2522 if s[0] == quote and s[-1] == quote:
2523 return s[1:-1]
2524 return s
2525
2526
2527 def get_domain(url):
2528 """
2529 This implementation is inconsistent, but is kept for compatibility.
2530 Use this only for "webpage_url_domain"
2531 """
2532 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
2533
2534
2535 def url_basename(url):
2536 path = urllib.parse.urlparse(url).path
2537 return path.strip('/').split('/')[-1]
2538
2539
2540 def base_url(url):
2541 return re.match(r'https?://[^?#]+/', url).group()
2542
2543
2544 def urljoin(base, path):
2545 if isinstance(path, bytes):
2546 path = path.decode()
2547 if not isinstance(path, str) or not path:
2548 return None
2549 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//', path):  # '-' last so it is literal, not a range
2550 return path
2551 if isinstance(base, bytes):
2552 base = base.decode()
2553 if not isinstance(base, str) or not re.match(
2554 r'^(?:https?:)?//', base):
2555 return None
2556 return urllib.parse.urljoin(base, path)
2557
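# Illustrative usage (informal sketch):
#   urljoin('https://example.com/a/', 'b/c.txt')  # -> 'https://example.com/a/b/c.txt'
#   urljoin('https://example.com/', '//cdn.example.com/x')  # -> '//cdn.example.com/x' (already absolute)
#   urljoin('ftp://example.com/', 'x')  # -> None (base must be http(s))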
2558
2559 class HEADRequest(urllib.request.Request):
2560 def get_method(self):
2561 return 'HEAD'
2562
2563
2564 class PUTRequest(urllib.request.Request):
2565 def get_method(self):
2566 return 'PUT'
2567
2568
2569 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2570 if get_attr and v is not None:
2571 v = getattr(v, get_attr, None)
2572 try:
2573 return int(v) * invscale // scale
2574 except (ValueError, TypeError, OverflowError):
2575 return default
2576
2577
2578 def str_or_none(v, default=None):
2579 return default if v is None else str(v)
2580
2581
2582 def str_to_int(int_str):
2583 """ A more relaxed version of int_or_none """
2584 if isinstance(int_str, int):
2585 return int_str
2586 elif isinstance(int_str, str):
2587 int_str = re.sub(r'[,\.\+]', '', int_str)
2588 return int_or_none(int_str)
2589
2590
2591 def float_or_none(v, scale=1, invscale=1, default=None):
2592 if v is None:
2593 return default
2594 try:
2595 return float(v) * invscale / scale
2596 except (ValueError, TypeError):
2597 return default
2598
2599
2600 def bool_or_none(v, default=None):
2601 return v if isinstance(v, bool) else default
2602
2603
2604 def strip_or_none(v, default=None):
2605 return v.strip() if isinstance(v, str) else default
2606
2607
2608 def url_or_none(url):
2609 if not url or not isinstance(url, str):
2610 return None
2611 url = url.strip()
2612 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2613
2614
2615 def request_to_url(req):
2616 if isinstance(req, urllib.request.Request):
2617 return req.get_full_url()
2618 else:
2619 return req
2620
2621
2622 def strftime_or_none(timestamp, date_format, default=None):
2623 datetime_object = None
2624 try:
2625 if isinstance(timestamp, (int, float)): # unix timestamp
2626 # Using naive datetime here can break timestamp() in Windows
2627 # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414
2628 datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)
2629 elif isinstance(timestamp, str): # assume YYYYMMDD
2630 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2631 date_format = re.sub( # Support %s on windows
2632 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
2633 return datetime_object.strftime(date_format)
2634 except (ValueError, TypeError, AttributeError):
2635 return default
2636
2637
2638 def parse_duration(s):
2639 if not isinstance(s, str):
2640 return None
2641 s = s.strip()
2642 if not s:
2643 return None
2644
2645 days, hours, mins, secs, ms = [None] * 5
2646 m = re.match(r'''(?x)
2647 (?P<before_secs>
2648 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2649 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2650 (?P<ms>[.:][0-9]+)?Z?$
2651 ''', s)
2652 if m:
2653 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2654 else:
2655 m = re.match(
2656 r'''(?ix)(?:P?
2657 (?:
2658 [0-9]+\s*y(?:ears?)?,?\s*
2659 )?
2660 (?:
2661 [0-9]+\s*m(?:onths?)?,?\s*
2662 )?
2663 (?:
2664 [0-9]+\s*w(?:eeks?)?,?\s*
2665 )?
2666 (?:
2667 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2668 )?
2669 T)?
2670 (?:
2671 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2672 )?
2673 (?:
2674 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2675 )?
2676 (?:
2677 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2678 )?Z?$''', s)
2679 if m:
2680 days, hours, mins, secs, ms = m.groups()
2681 else:
2682 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2683 if m:
2684 hours, mins = m.groups()
2685 else:
2686 return None
2687
2688 if ms:
2689 ms = ms.replace(':', '.')
2690 return sum(float(part or 0) * mult for part, mult in (
2691 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
2692
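# Illustrative usage (informal sketch):
#   parse_duration('9:12:43')  # -> 33163.0
#   parse_duration('3 min')    # -> 180.0
#   parse_duration('PT1H30M')  # -> 5400.0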
2693
2694 def prepend_extension(filename, ext, expected_real_ext=None):
2695 name, real_ext = os.path.splitext(filename)
2696 return (
2697 f'{name}.{ext}{real_ext}'
2698 if not expected_real_ext or real_ext[1:] == expected_real_ext
2699 else f'{filename}.{ext}')
2700
2701
2702 def replace_extension(filename, ext, expected_real_ext=None):
2703 name, real_ext = os.path.splitext(filename)
2704 return '{}.{}'.format(
2705 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2706 ext)
2707
2708
2709 def check_executable(exe, args=[]):
2710 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2711 args can be a list of arguments for a short output (like -version) """
2712 try:
2713 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
2714 except OSError:
2715 return False
2716 return exe
2717
2718
2719 def _get_exe_version_output(exe, args):
2720 try:
2721 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2722 # SIGTTOU if yt-dlp is run in the background.
2723 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2724 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2725 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
2726 except OSError:
2727 return False
2728 return stdout
2729
2730
2731 def detect_exe_version(output, version_re=None, unrecognized='present'):
2732 assert isinstance(output, str)
2733 if version_re is None:
2734 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2735 m = re.search(version_re, output)
2736 if m:
2737 return m.group(1)
2738 else:
2739 return unrecognized
2740
2741
2742 def get_exe_version(exe, args=['--version'],
2743 version_re=None, unrecognized='present'):
2744 """ Returns the version of the specified executable,
2745 or False if the executable is not present """
2746 out = _get_exe_version_output(exe, args)
2747 return detect_exe_version(out, version_re, unrecognized) if out else False
2748
2749
2750 def frange(start=0, stop=None, step=1):
2751 """Float range"""
2752 if stop is None:
2753 start, stop = 0, start
2754 sign = [-1, 1][step > 0] if step else 0
2755 while sign * start < sign * stop:
2756 yield start
2757 start += step
2758
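# Illustrative usage (informal sketch):
#   list(frange(2))           # -> [0, 1]
#   list(frange(0, 1, 0.25))  # -> [0, 0.25, 0.5, 0.75]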
2759
2760 class LazyList(collections.abc.Sequence):
2761 """Lazy immutable list from an iterable
2762 Note that slices of a LazyList are lists and not LazyList"""
2763
2764 class IndexError(IndexError):
2765 pass
2766
2767 def __init__(self, iterable, *, reverse=False, _cache=None):
2768 self._iterable = iter(iterable)
2769 self._cache = [] if _cache is None else _cache
2770 self._reversed = reverse
2771
2772 def __iter__(self):
2773 if self._reversed:
2774 # We need to consume the entire iterable to iterate in reverse
2775 yield from self.exhaust()
2776 return
2777 yield from self._cache
2778 for item in self._iterable:
2779 self._cache.append(item)
2780 yield item
2781
2782 def _exhaust(self):
2783 self._cache.extend(self._iterable)
2784 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2785 return self._cache
2786
2787 def exhaust(self):
2788 """Evaluate the entire iterable"""
2789 return self._exhaust()[::-1 if self._reversed else 1]
2790
2791 @staticmethod
2792 def _reverse_index(x):
2793 return None if x is None else ~x
2794
2795 def __getitem__(self, idx):
2796 if isinstance(idx, slice):
2797 if self._reversed:
2798 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
2799 start, stop, step = idx.start, idx.stop, idx.step or 1
2800 elif isinstance(idx, int):
2801 if self._reversed:
2802 idx = self._reverse_index(idx)
2803 start, stop, step = idx, idx, 0
2804 else:
2805 raise TypeError('indices must be integers or slices')
2806 if ((start or 0) < 0 or (stop or 0) < 0
2807 or (start is None and step < 0)
2808 or (stop is None and step > 0)):
2809 # We need to consume the entire iterable to be able to slice from the end
2810 # Obviously, never use this with infinite iterables
2811 self._exhaust()
2812 try:
2813 return self._cache[idx]
2814 except IndexError as e:
2815 raise self.IndexError(e) from e
2816 n = max(start or 0, stop or 0) - len(self._cache) + 1
2817 if n > 0:
2818 self._cache.extend(itertools.islice(self._iterable, n))
2819 try:
2820 return self._cache[idx]
2821 except IndexError as e:
2822 raise self.IndexError(e) from e
2823
2824 def __bool__(self):
2825 try:
2826 self[-1] if self._reversed else self[0]
2827 except self.IndexError:
2828 return False
2829 return True
2830
2831 def __len__(self):
2832 self._exhaust()
2833 return len(self._cache)
2834
2835 def __reversed__(self):
2836 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
2837
2838 def __copy__(self):
2839 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
2840
2841 def __repr__(self):
2842 # repr and str should mimic a list. So we exhaust the iterable
2843 return repr(self.exhaust())
2844
2845 def __str__(self):
2846 return repr(self.exhaust())
2847
2848
2849 class PagedList:
2850
2851 class IndexError(IndexError):
2852 pass
2853
2854 def __len__(self):
2855 # This is only useful for tests
2856 return len(self.getslice())
2857
2858 def __init__(self, pagefunc, pagesize, use_cache=True):
2859 self._pagefunc = pagefunc
2860 self._pagesize = pagesize
2861 self._pagecount = float('inf')
2862 self._use_cache = use_cache
2863 self._cache = {}
2864
2865 def getpage(self, pagenum):
2866 page_results = self._cache.get(pagenum)
2867 if page_results is None:
2868 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2869 if self._use_cache:
2870 self._cache[pagenum] = page_results
2871 return page_results
2872
2873 def getslice(self, start=0, end=None):
2874 return list(self._getslice(start, end))
2875
2876 def _getslice(self, start, end):
2877 raise NotImplementedError('This method must be implemented by subclasses')
2878
2879 def __getitem__(self, idx):
2880 assert self._use_cache, 'Indexing PagedList requires cache'
2881 if not isinstance(idx, int) or idx < 0:
2882 raise TypeError('indices must be non-negative integers')
2883 entries = self.getslice(idx, idx + 1)
2884 if not entries:
2885 raise self.IndexError()
2886 return entries[0]
2887
2888
2889 class OnDemandPagedList(PagedList):
2890 """Download pages until a page with less than maximum results"""
2891
2892 def _getslice(self, start, end):
2893 for pagenum in itertools.count(start // self._pagesize):
2894 firstid = pagenum * self._pagesize
2895 nextfirstid = pagenum * self._pagesize + self._pagesize
2896 if start >= nextfirstid:
2897 continue
2898
2899 startv = (
2900 start % self._pagesize
2901 if firstid <= start < nextfirstid
2902 else 0)
2903 endv = (
2904 ((end - 1) % self._pagesize) + 1
2905 if (end is not None and firstid <= end <= nextfirstid)
2906 else None)
2907
2908 try:
2909 page_results = self.getpage(pagenum)
2910 except Exception:
2911 self._pagecount = pagenum - 1
2912 raise
2913 if startv != 0 or endv is not None:
2914 page_results = page_results[startv:endv]
2915 yield from page_results
2916
2917 # A little optimization: if the current page is not "full", i.e. does
2918 # not contain page_size videos, then we can assume that this page
2919 # is the last one - there are no more ids on further pages -
2920 # so there is no need to query again.
2921 if len(page_results) + startv < self._pagesize:
2922 break
2923
2924 # If we got the whole page, but the next page is not interesting,
2925 # break out early as well
2926 if end == nextfirstid:
2927 break
2928
2929
2930 class InAdvancePagedList(PagedList):
2931 """PagedList with total number of pages known in advance"""
2932
2933 def __init__(self, pagefunc, pagecount, pagesize):
2934 PagedList.__init__(self, pagefunc, pagesize, True)
2935 self._pagecount = pagecount
2936
2937 def _getslice(self, start, end):
2938 start_page = start // self._pagesize
2939 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2940 skip_elems = start - start_page * self._pagesize
2941 only_more = None if end is None else end - start
2942 for pagenum in range(start_page, end_page):
2943 page_results = self.getpage(pagenum)
2944 if skip_elems:
2945 page_results = page_results[skip_elems:]
2946 skip_elems = None
2947 if only_more is not None:
2948 if len(page_results) < only_more:
2949 only_more -= len(page_results)
2950 else:
2951 yield from page_results[:only_more]
2952 break
2953 yield from page_results
2954
2955
2956 class PlaylistEntries:
2957 MissingEntry = object()
2958 is_exhausted = False
2959
2960 def __init__(self, ydl, info_dict):
2961 self.ydl = ydl
2962
2963 # _entries must be assigned now since info_dict can change during iteration
2964 entries = info_dict.get('entries')
2965 if entries is None:
2966 raise EntryNotInPlaylist('There are no entries')
2967 elif isinstance(entries, list):
2968 self.is_exhausted = True
2969
2970 requested_entries = info_dict.get('requested_entries')
2971 self.is_incomplete = requested_entries is not None
2972 if self.is_incomplete:
2973 assert self.is_exhausted
2974 self._entries = [self.MissingEntry] * max(requested_entries or [0])
2975 for i, entry in zip(requested_entries, entries):
2976 self._entries[i - 1] = entry
2977 elif isinstance(entries, (list, PagedList, LazyList)):
2978 self._entries = entries
2979 else:
2980 self._entries = LazyList(entries)
2981
2982 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2983 (?P<start>[+-]?\d+)?
2984 (?P<range>[:-]
2985 (?P<end>[+-]?\d+|inf(?:inite)?)?
2986 (?::(?P<step>[+-]?\d+))?
2987 )?''')
2988
2989 @classmethod
2990 def parse_playlist_items(cls, string):
2991 for segment in string.split(','):
2992 if not segment:
2993 raise ValueError('There are two or more consecutive commas')
2994 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2995 if not mobj:
2996 raise ValueError(f'{segment!r} is not a valid specification')
2997 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2998 if int_or_none(step) == 0:
2999 raise ValueError(f'Step in {segment!r} cannot be zero')
3000 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
3001
3002 def get_requested_items(self):
3003 playlist_items = self.ydl.params.get('playlist_items')
3004 playlist_start = self.ydl.params.get('playliststart', 1)
3005 playlist_end = self.ydl.params.get('playlistend')
3006 # For backwards compatibility, interpret -1 as whole list
3007 if playlist_end in (-1, None):
3008 playlist_end = ''
3009 if not playlist_items:
3010 playlist_items = f'{playlist_start}:{playlist_end}'
3011 elif playlist_start != 1 or playlist_end:
3012 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
3013
3014 for index in self.parse_playlist_items(playlist_items):
3015 for i, entry in self[index]:
3016 yield i, entry
3017 if not entry:
3018 continue
3019 try:
3020 # TODO: Add auto-generated fields
3021 self.ydl._match_entry(entry, incomplete=True, silent=True)
3022 except (ExistingVideoReached, RejectedVideoReached):
3023 return
3024
3025 def get_full_count(self):
3026 if self.is_exhausted and not self.is_incomplete:
3027 return len(self)
3028 elif isinstance(self._entries, InAdvancePagedList):
3029 if self._entries._pagesize == 1:
3030 return self._entries._pagecount
3031
3032 @functools.cached_property
3033 def _getter(self):
3034 if isinstance(self._entries, list):
3035 def get_entry(i):
3036 try:
3037 entry = self._entries[i]
3038 except IndexError:
3039 entry = self.MissingEntry
3040 if not self.is_incomplete:
3041 raise self.IndexError()
3042 if entry is self.MissingEntry:
3043 raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
3044 return entry
3045 else:
3046 def get_entry(i):
3047 try:
3048 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
3049 except (LazyList.IndexError, PagedList.IndexError):
3050 raise self.IndexError()
3051 return get_entry
3052
3053 def __getitem__(self, idx):
3054 if isinstance(idx, int):
3055 idx = slice(idx, idx)
3056
3057 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3058 step = 1 if idx.step is None else idx.step
3059 if idx.start is None:
3060 start = 0 if step > 0 else len(self) - 1
3061 else:
3062 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3063
3064 # NB: Do not call len(self) when idx == [:]
3065 if idx.stop is None:
3066 stop = 0 if step < 0 else float('inf')
3067 else:
3068 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3069 stop += [-1, 1][step > 0]
3070
3071 for i in frange(start, stop, step):
3072 if i < 0:
3073 continue
3074 try:
3075 entry = self._getter(i)
3076 except self.IndexError:
3077 self.is_exhausted = True
3078 if step > 0:
3079 break
3080 continue
3081 yield i + 1, entry
3082
3083 def __len__(self):
3084 return len(tuple(self[:]))
3085
3086 class IndexError(IndexError):
3087 pass
3088
3089
3090 def uppercase_escape(s):
3091 unicode_escape = codecs.getdecoder('unicode_escape')
3092 return re.sub(
3093 r'\\U[0-9a-fA-F]{8}',
3094 lambda m: unicode_escape(m.group(0))[0],
3095 s)
3096
3097
3098 def lowercase_escape(s):
3099 unicode_escape = codecs.getdecoder('unicode_escape')
3100 return re.sub(
3101 r'\\u[0-9a-fA-F]{4}',
3102 lambda m: unicode_escape(m.group(0))[0],
3103 s)
3104
3105
3106 def escape_rfc3986(s):
3107 """Escape non-ASCII characters as suggested by RFC 3986"""
3108 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
3109
3110
3111 def escape_url(url):
3112 """Escape URL as suggested by RFC 3986"""
3113 url_parsed = urllib.parse.urlparse(url)
3114 return url_parsed._replace(
3115 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
3116 path=escape_rfc3986(url_parsed.path),
3117 params=escape_rfc3986(url_parsed.params),
3118 query=escape_rfc3986(url_parsed.query),
3119 fragment=escape_rfc3986(url_parsed.fragment)
3120 ).geturl()
3121
3122
3123 def parse_qs(url, **kwargs):
3124 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
3125
3126
3127 def read_batch_urls(batch_fd):
3128 def fixup(url):
3129 if not isinstance(url, str):
3130 url = url.decode('utf-8', 'replace')
3131 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3132 for bom in BOM_UTF8:
3133 if url.startswith(bom):
3134 url = url[len(bom):]
3135 url = url.lstrip()
3136 if not url or url.startswith(('#', ';', ']')):
3137 return False
3138 # "#" cannot be stripped out since it is part of the URI
3139 # However, it can be safely stripped out if it follows whitespace
3140 return re.split(r'\s#', url, 1)[0].rstrip()
3141
3142 with contextlib.closing(batch_fd) as fd:
3143 return [url for url in map(fixup, fd) if url]
3144
3145
3146 def urlencode_postdata(*args, **kargs):
3147 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
3148
3149
3150 def update_url_query(url, query):
3151 if not query:
3152 return url
3153 parsed_url = urllib.parse.urlparse(url)
3154 qs = urllib.parse.parse_qs(parsed_url.query)
3155 qs.update(query)
3156 return urllib.parse.urlunparse(parsed_url._replace(
3157 query=urllib.parse.urlencode(qs, True)))
3158
3159
3160 def update_Request(req, url=None, data=None, headers=None, query=None):
3161 req_headers = req.headers.copy()
3162 req_headers.update(headers or {})
3163 req_data = data or req.data
3164 req_url = update_url_query(url or req.get_full_url(), query)
3165 req_get_method = req.get_method()
3166 if req_get_method == 'HEAD':
3167 req_type = HEADRequest
3168 elif req_get_method == 'PUT':
3169 req_type = PUTRequest
3170 else:
3171 req_type = urllib.request.Request
3172 new_req = req_type(
3173 req_url, data=req_data, headers=req_headers,
3174 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3175 if hasattr(req, 'timeout'):
3176 new_req.timeout = req.timeout
3177 return new_req
3178
3179
3180 def _multipart_encode_impl(data, boundary):
3181 content_type = 'multipart/form-data; boundary=%s' % boundary
3182
3183 out = b''
3184 for k, v in data.items():
3185 out += b'--' + boundary.encode('ascii') + b'\r\n'
3186 if isinstance(k, str):
3187 k = k.encode()
3188 if isinstance(v, str):
3189 v = v.encode()
3190 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3191 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3192 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3193 if boundary.encode('ascii') in content:
3194 raise ValueError('Boundary overlaps with data')
3195 out += content
3196
3197 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3198
3199 return out, content_type
3200
3201
3202 def multipart_encode(data, boundary=None):
3203 '''
3204 Encode a dict to RFC 7578-compliant form-data
3205
3206 data:
3207 A dict where keys and values can be either Unicode or bytes-like
3208 objects.
3209 boundary:
3210 If specified, it must be a Unicode object and is used as the boundary.
3211 Otherwise a random boundary is generated.
3212
3213 Reference: https://tools.ietf.org/html/rfc7578
3214 '''
3215 has_specified_boundary = boundary is not None
3216
3217 while True:
3218 if boundary is None:
3219 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3220
3221 try:
3222 out, content_type = _multipart_encode_impl(data, boundary)
3223 break
3224 except ValueError:
3225 if has_specified_boundary:
3226 raise
3227 boundary = None
3228
3229 return out, content_type
3230
3231
3232 def variadic(x, allowed_types=(str, bytes, dict)):
3233 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
3234
3235
3236 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3237 for val in map(d.get, variadic(key_or_keys)):
3238 if val is not None and (val or not skip_false_values):
3239 return val
3240 return default
3241
3242
3243 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3244 for f in funcs:
3245 try:
3246 val = f(*args, **kwargs)
3247 except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError):
3248 pass
3249 else:
3250 if expected_type is None or isinstance(val, expected_type):
3251 return val
3252
3253
3254 def try_get(src, getter, expected_type=None):
3255 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
3256
3257
3258 def filter_dict(dct, cndn=lambda _, v: v is not None):
3259 return {k: v for k, v in dct.items() if cndn(k, v)}
3260
3261
3262 def merge_dicts(*dicts):
3263 merged = {}
3264 for a_dict in dicts:
3265 for k, v in a_dict.items():
3266 if (v is not None and k not in merged
3267 or isinstance(v, str) and merged[k] == ''):
3268 merged[k] = v
3269 return merged
3270
3271
3272 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3273 return string if isinstance(string, str) else str(string, encoding, errors)
3274
3275
3276 US_RATINGS = {
3277 'G': 0,
3278 'PG': 10,
3279 'PG-13': 13,
3280 'R': 16,
3281 'NC': 18,
3282 }
3283
3284
3285 TV_PARENTAL_GUIDELINES = {
3286 'TV-Y': 0,
3287 'TV-Y7': 7,
3288 'TV-G': 0,
3289 'TV-PG': 0,
3290 'TV-14': 14,
3291 'TV-MA': 17,
3292 }
3293
3294
3295 def parse_age_limit(s):
3296 # isinstance(False, int) is True. So type() must be used instead
3297 if type(s) is int: # noqa: E721
3298 return s if 0 <= s <= 21 else None
3299 elif not isinstance(s, str):
3300 return None
3301 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3302 if m:
3303 return int(m.group('age'))
3304 s = s.upper()
3305 if s in US_RATINGS:
3306 return US_RATINGS[s]
3307 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3308 if m:
3309 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3310 return None
3311
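# Illustrative usage (informal sketch):
#   parse_age_limit('PG-13')  # -> 13
#   parse_age_limit('TV-MA')  # -> 17
#   parse_age_limit(18)       # -> 18 (booleans return None)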
3312
3313 def strip_jsonp(code):
3314 return re.sub(
3315 r'''(?sx)^
3316 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3317 (?:\s*&&\s*(?P=func_name))?
3318 \s*\(\s*(?P<callback_data>.*)\);?
3319 \s*?(?://[^\n]*)*$''',
3320 r'\g<callback_data>', code)
3321
3322
3323 def js_to_json(code, vars={}, *, strict=False):
3324 # vars is a dict of var, val pairs to substitute
3325 STRING_QUOTES = '\'"'
3326 STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES)
3327 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3328 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
3329 INTEGER_TABLE = (
3330 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3331 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
3332 )
3333
3334 def process_escape(match):
3335 JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
3336 escape = match.group(1) or match.group(2)
3337
3338 return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES
3339 else R'\u00' if escape == 'x'
3340 else '' if escape == '\n'
3341 else escape)
3342
3343 def fix_kv(m):
3344 v = m.group(0)
3345 if v in ('true', 'false', 'null'):
3346 return v
3347 elif v in ('undefined', 'void 0'):
3348 return 'null'
3349 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3350 return ''
3351
3352 if v[0] in STRING_QUOTES:
3353 escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1])
3354 return f'"{escaped}"'
3355
3356 for regex, base in INTEGER_TABLE:
3357 im = re.match(regex, v)
3358 if im:
3359 i = int(im.group(1), base)
3360 return f'"{i}":' if v.endswith(':') else str(i)
3361
3362 if v in vars:
3363 return json.dumps(vars[v])
3364
3365 if not strict:
3366 return f'"{v}"'
3367
3368 raise ValueError(f'Unknown value: {v}')
3369
3370 def create_map(mobj):
3371 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3372
3373 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
3374 if not strict:
3375 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3376 code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
3377
3378 return re.sub(rf'''(?sx)
3379 {STRING_RE}|
3380 {COMMENT_RE}|,(?={SKIP_RE}[\]}}])|
3381 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3382 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?|
3383 [0-9]+(?={SKIP_RE}:)|
3384 !+
3385 ''', fix_kv, code)
3386
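# Illustrative usage (informal sketch): relaxed JavaScript object literals
# become strict JSON that json.loads() can consume:
#   js_to_json("{foo: 'bar', baz: 0x10}")  # -> '{"foo": "bar", "baz": 16}'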
3387
3388 def qualities(quality_ids):
3389 """ Get a numeric quality value out of a list of possible values """
3390 def q(qid):
3391 try:
3392 return quality_ids.index(qid)
3393 except ValueError:
3394 return -1
3395 return q
3396
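# Illustrative usage (informal sketch):
#   q = qualities(['144p', '240p', '720p'])
#   q('240p')   # -> 1
#   q('1080p')  # -> -1 (unknown IDs sort below all known ones)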
3397
3398 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
3399
3400
3401 DEFAULT_OUTTMPL = {
3402 'default': '%(title)s [%(id)s].%(ext)s',
3403 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3404 }
3405 OUTTMPL_TYPES = {
3406 'chapter': None,
3407 'subtitle': None,
3408 'thumbnail': None,
3409 'description': 'description',
3410 'annotation': 'annotations.xml',
3411 'infojson': 'info.json',
3412 'link': None,
3413 'pl_video': None,
3414 'pl_thumbnail': None,
3415 'pl_description': 'description',
3416 'pl_infojson': 'info.json',
3417 }
3418
3419 # As of [1], the format syntax is:
3420 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3421 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3422 STR_FORMAT_RE_TMPL = r'''(?x)
3423 (?<!%)(?P<prefix>(?:%%)*)
3424 %
3425 (?P<has_key>\((?P<key>{0})\))?
3426 (?P<format>
3427 (?P<conversion>[#0\-+ ]+)?
3428 (?P<min_width>\d+)?
3429 (?P<precision>\.\d+)?
3430 (?P<len_mod>[hlL])? # unused in python
3431 {1} # conversion type
3432 )
3433 '''
3434
3435
3436 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3437
3438
3439 def limit_length(s, length):
3440 """ Add ellipses to overly long strings """
3441 if s is None:
3442 return None
3443 ELLIPSES = '...'
3444 if len(s) > length:
3445 return s[:length - len(ELLIPSES)] + ELLIPSES
3446 return s
3447
3448
3449 def version_tuple(v):
3450 return tuple(int(e) for e in re.split(r'[-.]', v))
3451
3452
3453 def is_outdated_version(version, limit, assume_new=True):
3454 if not version:
3455 return not assume_new
3456 try:
3457 return version_tuple(version) < version_tuple(limit)
3458 except ValueError:
3459 return not assume_new
3460
3461
3462 def ytdl_is_updateable():
3463 """ Returns if yt-dlp can be updated with -U """
3464
3465 from .update import is_non_updateable
3466
3467 return not is_non_updateable()
3468
3469
3470 def args_to_str(args):
3471 # Get a short string representation for a subprocess command
3472 return ' '.join(compat_shlex_quote(a) for a in args)
3473
3474
3475 def error_to_compat_str(err):
3476 return str(err)
3477
3478
3479 def error_to_str(err):
3480 return f'{type(err).__name__}: {err}'
3481
3482
3483 def mimetype2ext(mt):
3484 if mt is None:
3485 return None
3486
3487 mt, _, params = mt.partition(';')
3488 mt = mt.strip()
3489
3490 FULL_MAP = {
3491 'audio/mp4': 'm4a',
3492 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
3493 # since it's the most popular one
3494 'audio/mpeg': 'mp3',
3495 'audio/x-wav': 'wav',
3496 'audio/wav': 'wav',
3497 'audio/wave': 'wav',
3498 }
3499
3500 ext = FULL_MAP.get(mt)
3501 if ext is not None:
3502 return ext
3503
3504 SUBTYPE_MAP = {
3505 '3gpp': '3gp',
3506 'smptett+xml': 'tt',
3507 'ttaf+xml': 'dfxp',
3508 'ttml+xml': 'ttml',
3509 'x-flv': 'flv',
3510 'x-mp4-fragmented': 'mp4',
3511 'x-ms-sami': 'sami',
3512 'x-ms-wmv': 'wmv',
3513 'mpegurl': 'm3u8',
3514 'x-mpegurl': 'm3u8',
3515 'vnd.apple.mpegurl': 'm3u8',
3516 'dash+xml': 'mpd',
3517 'f4m+xml': 'f4m',
3518 'hds+xml': 'f4m',
3519 'vnd.ms-sstr+xml': 'ism',
3520 'quicktime': 'mov',
3521 'mp2t': 'ts',
3522 'x-wav': 'wav',
3523 'filmstrip+json': 'fs',
3524 'svg+xml': 'svg',
3525 }
3526
3527 _, _, subtype = mt.rpartition('/')
3528 ext = SUBTYPE_MAP.get(subtype.lower())
3529 if ext is not None:
3530 return ext
3531
3532 SUFFIX_MAP = {
3533 'json': 'json',
3534 'xml': 'xml',
3535 'zip': 'zip',
3536 'gzip': 'gz',
3537 }
3538
3539 _, _, suffix = subtype.partition('+')
3540 ext = SUFFIX_MAP.get(suffix)
3541 if ext is not None:
3542 return ext
3543
3544 return subtype.replace('+', '.')
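
# For instance (illustrative, exercising each lookup tier in turn):
#   >>> mimetype2ext('audio/mp4')                            # full-type map
#   'm4a'
#   >>> mimetype2ext('application/x-mpegURL')                # subtype map
#   'm3u8'
#   >>> mimetype2ext('application/ld+json; charset=utf-8')   # '+suffix' map
#   'json'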
3545
3546
3547 def ext2mimetype(ext_or_url):
3548 if not ext_or_url:
3549 return None
3550 if '.' not in ext_or_url:
3551 ext_or_url = f'file.{ext_or_url}'
3552 return mimetypes.guess_type(ext_or_url)[0]
3553
3554
3555 def parse_codecs(codecs_str):
3556 # http://tools.ietf.org/html/rfc6381
3557 if not codecs_str:
3558 return {}
3559 split_codecs = list(filter(None, map(
3560 str.strip, codecs_str.strip().strip(',').split(','))))
3561 vcodec, acodec, scodec, hdr = None, None, None, None
3562 for full_codec in split_codecs:
3563 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3564 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3565 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3566 if vcodec:
3567 continue
3568 vcodec = full_codec
3569 if parts[0] in ('dvh1', 'dvhe'):
3570 hdr = 'DV'
3571 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3572 hdr = 'HDR10'
3573 elif parts[:2] == ['vp9', '2']:
3574 hdr = 'HDR10'
3575 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3576 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3577 acodec = acodec or full_codec
3578 elif parts[0] in ('stpp', 'wvtt'):
3579 scodec = scodec or full_codec
3580 else:
3581 write_string(f'WARNING: Unknown codec {full_codec}\n')
3582 if vcodec or acodec or scodec:
3583 return {
3584 'vcodec': vcodec or 'none',
3585 'acodec': acodec or 'none',
3586 'dynamic_range': hdr,
3587 **({'scodec': scodec} if scodec is not None else {}),
3588 }
3589 elif len(split_codecs) == 2:
3590 return {
3591 'vcodec': split_codecs[0],
3592 'acodec': split_codecs[1],
3593 }
3594 return {}
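
# For instance (illustrative codec strings):
#   >>> parse_codecs('avc1.64001f, mp4a.40.2')
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
#   >>> parse_codecs('dvh1.05.06')
#   {'vcodec': 'dvh1.05.06', 'acodec': 'none', 'dynamic_range': 'DV'}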
3595
3596
3597 def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3598 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3599
3600 allow_mkv = not preferences or 'mkv' in preferences
3601
3602 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3603 return 'mkv' # TODO: any other format allows this?
3604
3605 # TODO: Not all codecs supported by parse_codecs are handled here
3606 COMPATIBLE_CODECS = {
3607 'mp4': {
3608 'av1', 'hevc', 'avc1', 'mp4a', # fourcc (m3u8, mpd)
3609 'h264', 'aacl', 'ec-3', # Set in ISM
3610 },
3611 'webm': {
3612 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3613 'vp9x', 'vp8x', # in the webm spec
3614 },
3615 }
3616
3617 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3618 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
3619
3620 for ext in preferences or COMPATIBLE_CODECS.keys():
3621 codec_set = COMPATIBLE_CODECS.get(ext, set())
3622 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3623 return ext
3624
3625 COMPATIBLE_EXTS = (
3626 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3627 {'webm'},
3628 )
3629 for ext in preferences or vexts:
3630 current_exts = {ext, *vexts, *aexts}
3631 if ext == 'mkv' or current_exts == {ext} or any(
3632 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3633 return ext
3634 return 'mkv' if allow_mkv else preferences[-1]
3635
3636
3637 def urlhandle_detect_ext(url_handle):
3638 getheader = url_handle.headers.get
3639
3640 cd = getheader('Content-Disposition')
3641 if cd:
3642 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3643 if m:
3644 e = determine_ext(m.group('filename'), default_ext=None)
3645 if e:
3646 return e
3647
3648 return mimetype2ext(getheader('Content-Type'))
3649
3650
3651 def encode_data_uri(data, mime_type):
3652 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3653
3654
3655 def age_restricted(content_limit, age_limit):
3656 """ Returns True iff the content should be blocked """
3657
3658 if age_limit is None: # No limit set
3659 return False
3660 if content_limit is None:
3661 return False # Content available for everyone
3662 return age_limit < content_limit
3663
3664
3665 # List of known byte-order-marks (BOM)
3666 BOMS = [
3667 (b'\xef\xbb\xbf', 'utf-8'),
3668 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3669 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3670 (b'\xff\xfe', 'utf-16-le'),
3671 (b'\xfe\xff', 'utf-16-be'),
3672 ]
3673
3674
3675 def is_html(first_bytes):
3676 """ Detect whether a file contains HTML by examining its first bytes. """
3677
3678 encoding = 'utf-8'
3679 for bom, enc in BOMS:
3680 while first_bytes.startswith(bom):
3681 encoding, first_bytes = enc, first_bytes[len(bom):]
3682
3683 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
3684
3685
3686 def determine_protocol(info_dict):
3687 protocol = info_dict.get('protocol')
3688 if protocol is not None:
3689 return protocol
3690
3691 url = sanitize_url(info_dict['url'])
3692 if url.startswith('rtmp'):
3693 return 'rtmp'
3694 elif url.startswith('mms'):
3695 return 'mms'
3696 elif url.startswith('rtsp'):
3697 return 'rtsp'
3698
3699 ext = determine_ext(url)
3700 if ext == 'm3u8':
3701 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
3702 elif ext == 'f4m':
3703 return 'f4m'
3704
3705 return urllib.parse.urlparse(url).scheme
3706
3707
3708 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3709 """ Render a list of rows, each as a list of values.
3710 Text after a \t will be right aligned """
3711 def width(string):
3712 return len(remove_terminal_sequences(string).replace('\t', ''))
3713
3714 def get_max_lens(table):
3715 return [max(width(str(v)) for v in col) for col in zip(*table)]
3716
3717 def filter_using_list(row, filterArray):
3718 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3719
3720 max_lens = get_max_lens(data) if hide_empty else []
3721 header_row = filter_using_list(header_row, max_lens)
3722 data = [filter_using_list(row, max_lens) for row in data]
3723
3724 table = [header_row] + data
3725 max_lens = get_max_lens(table)
3726 extra_gap += 1
3727 if delim:
3728 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3729 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3730 for row in table:
3731 for pos, text in enumerate(map(str, row)):
3732 if '\t' in text:
3733 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3734 else:
3735 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3736 ret = '\n'.join(''.join(row).rstrip() for row in table)
3737 return ret
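
# For instance (illustrative): columns are padded to the widest cell:
#   >>> print(render_table(['id', 'name'], [['1', 'foo'], ['22', 'bar']]))
#   id name
#   1  foo
#   22 bar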
3738
3739
3740 def _match_one(filter_part, dct, incomplete):
3741 # TODO: Generalize code with YoutubeDL._build_format_filter
3742 STRING_OPERATORS = {
3743 '*=': operator.contains,
3744 '^=': lambda attr, value: attr.startswith(value),
3745 '$=': lambda attr, value: attr.endswith(value),
3746 '~=': lambda attr, value: re.search(value, attr),
3747 }
3748 COMPARISON_OPERATORS = {
3749 **STRING_OPERATORS,
3750 '<=': operator.le, # "<=" must be defined above "<"
3751 '<': operator.lt,
3752 '>=': operator.ge,
3753 '>': operator.gt,
3754 '=': operator.eq,
3755 }
3756
3757 if isinstance(incomplete, bool):
3758 is_incomplete = lambda _: incomplete
3759 else:
3760 is_incomplete = lambda k: k in incomplete
3761
3762 operator_rex = re.compile(r'''(?x)
3763 (?P<key>[a-z_]+)
3764 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3765 (?:
3766 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3767 (?P<strval>.+?)
3768 )
3769 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3770 m = operator_rex.fullmatch(filter_part.strip())
3771 if m:
3772 m = m.groupdict()
3773 unnegated_op = COMPARISON_OPERATORS[m['op']]
3774 if m['negation']:
3775 op = lambda attr, value: not unnegated_op(attr, value)
3776 else:
3777 op = unnegated_op
3778 comparison_value = m['quotedstrval'] or m['strval']  # exactly one of these matched; the old "or m['intval']" referenced a group the regex does not define
3779 if m['quote']:
3780 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3781 actual_value = dct.get(m['key'])
3782 numeric_comparison = None
3783 if isinstance(actual_value, (int, float)):
3784 # If the original field is a string and the matching comparison value is
3785 # a number, we should respect the origin of the original field
3786 # and process the comparison value as a string (see
3787 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3788 try:
3789 numeric_comparison = int(comparison_value)
3790 except ValueError:
3791 numeric_comparison = parse_filesize(comparison_value)
3792 if numeric_comparison is None:
3793 numeric_comparison = parse_filesize(f'{comparison_value}B')
3794 if numeric_comparison is None:
3795 numeric_comparison = parse_duration(comparison_value)
3796 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3797 raise ValueError('Operator %s only supports string values!' % m['op'])
3798 if actual_value is None:
3799 return is_incomplete(m['key']) or m['none_inclusive']
3800 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3801
3802 UNARY_OPERATORS = {
3803 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3804 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3805 }
3806 operator_rex = re.compile(r'''(?x)
3807 (?P<op>%s)\s*(?P<key>[a-z_]+)
3808 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3809 m = operator_rex.fullmatch(filter_part.strip())
3810 if m:
3811 op = UNARY_OPERATORS[m.group('op')]
3812 actual_value = dct.get(m.group('key'))
3813 if is_incomplete(m.group('key')) and actual_value is None:
3814 return True
3815 return op(actual_value)
3816
3817 raise ValueError('Invalid filter part %r' % filter_part)
3818
3819
3820 def match_str(filter_str, dct, incomplete=False):
3821 """ Filter a dictionary with a simple string syntax.
3822 @returns Whether the filter passes
3823 @param incomplete Set of keys that are expected to be missing from dct.
3824 Can be True/False to indicate all/none of the keys may be missing.
3825 All conditions on incomplete keys pass if the key is missing
3826 """
3827 return all(
3828 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3829 for filter_part in re.split(r'(?<!\\)&', filter_str))
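
# For instance (illustrative): '&' separates conditions, '!' negates, and a
# trailing '?' on an operator lets a missing field pass the condition:
#   >>> match_str('duration > 60 & like_count >? 100', {'duration': 120})
#   True
#   >>> match_str('!is_live', {'is_live': True})
#   False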
3830
3831
3832 def match_filter_func(filters):
3833 if not filters:
3834 return None
3835 filters = set(variadic(filters))
3836
3837 interactive = '-' in filters
3838 if interactive:
3839 filters.remove('-')
3840
3841 def _match_func(info_dict, incomplete=False):
3842 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3843 return NO_DEFAULT if interactive and not incomplete else None
3844 else:
3845 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
3846 filter_str = ') | ('.join(map(str.strip, filters))
3847 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3848 return _match_func
3849
3850
3851 class download_range_func:
3852 def __init__(self, chapters, ranges):
3853 self.chapters, self.ranges = chapters, ranges
3854
3855 def __call__(self, info_dict, ydl):
3856 if not self.ranges and not self.chapters:
3857 yield {}
3858
3859 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
3860 else 'Cannot match chapters since chapter information is unavailable')
3861 for regex in self.chapters or []:
3862 for i, chapter in enumerate(info_dict.get('chapters') or []):
3863 if re.search(regex, chapter['title']):
3864 warning = None
3865 yield {**chapter, 'index': i}
3866 if self.chapters and warning:
3867 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3868
3869 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
3870
3871 def __eq__(self, other):
3872 return (isinstance(other, download_range_func)
3873 and self.chapters == other.chapters and self.ranges == other.ranges)
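
# For instance (illustrative): used as a `download_ranges` callback, yielding
# the sections to download:
#   >>> f = download_range_func(chapters=None, ranges=[(10, 20)])
#   >>> list(f({'id': 'example'}, ydl=None))
#   [{'start_time': 10, 'end_time': 20}]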
3874
3875
3876 def parse_dfxp_time_expr(time_expr):
3877 if not time_expr:
3878 return
3879
3880 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
3881 if mobj:
3882 return float(mobj.group('time_offset'))
3883
3884 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3885 if mobj:
3886 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3887
3888
3889 def srt_subtitles_timecode(seconds):
3890 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3891
3892
3893 def ass_subtitles_timecode(seconds):
3894 time = timetuple_from_msec(seconds * 1000)
3895 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
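
# For instance (illustrative):
#   >>> parse_dfxp_time_expr('00:01:30.5')
#   90.5
#   >>> srt_subtitles_timecode(90.5)
#   '00:01:30,500'
#   >>> ass_subtitles_timecode(90.5)
#   '0:01:30.50'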
3896
3897
3898 def dfxp2srt(dfxp_data):
3899 '''
3900 @param dfxp_data A bytes-like object containing DFXP data
3901 @returns A unicode object containing converted SRT data
3902 '''
3903 LEGACY_NAMESPACES = (
3904 (b'http://www.w3.org/ns/ttml', [
3905 b'http://www.w3.org/2004/11/ttaf1',
3906 b'http://www.w3.org/2006/04/ttaf1',
3907 b'http://www.w3.org/2006/10/ttaf1',
3908 ]),
3909 (b'http://www.w3.org/ns/ttml#styling', [
3910 b'http://www.w3.org/ns/ttml#style',
3911 ]),
3912 )
3913
3914 SUPPORTED_STYLING = [
3915 'color',
3916 'fontFamily',
3917 'fontSize',
3918 'fontStyle',
3919 'fontWeight',
3920 'textDecoration'
3921 ]
3922
3923 _x = functools.partial(xpath_with_ns, ns_map={
3924 'xml': 'http://www.w3.org/XML/1998/namespace',
3925 'ttml': 'http://www.w3.org/ns/ttml',
3926 'tts': 'http://www.w3.org/ns/ttml#styling',
3927 })
3928
3929 styles = {}
3930 default_style = {}
3931
3932 class TTMLPElementParser:
3933 def __init__(self):
3934 # Keep parser state per-instance; class-level lists would be shared across instances
3935 self._out, self._unclosed_elements, self._applied_styles = '', [], []
3936
3937 def start(self, tag, attrib):
3938 if tag in (_x('ttml:br'), 'br'):
3939 self._out += '\n'
3940 else:
3941 unclosed_elements = []
3942 style = {}
3943 element_style_id = attrib.get('style')
3944 if default_style:
3945 style.update(default_style)
3946 if element_style_id:
3947 style.update(styles.get(element_style_id, {}))
3948 for prop in SUPPORTED_STYLING:
3949 prop_val = attrib.get(_x('tts:' + prop))
3950 if prop_val:
3951 style[prop] = prop_val
3952 if style:
3953 font = ''
3954 for k, v in sorted(style.items()):
3955 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3956 continue
3957 if k == 'color':
3958 font += ' color="%s"' % v
3959 elif k == 'fontSize':
3960 font += ' size="%s"' % v
3961 elif k == 'fontFamily':
3962 font += ' face="%s"' % v
3963 elif k == 'fontWeight' and v == 'bold':
3964 self._out += '<b>'
3965 unclosed_elements.append('b')
3966 elif k == 'fontStyle' and v == 'italic':
3967 self._out += '<i>'
3968 unclosed_elements.append('i')
3969 elif k == 'textDecoration' and v == 'underline':
3970 self._out += '<u>'
3971 unclosed_elements.append('u')
3972 if font:
3973 self._out += '<font' + font + '>'
3974 unclosed_elements.append('font')
3975 applied_style = {}
3976 if self._applied_styles:
3977 applied_style.update(self._applied_styles[-1])
3978 applied_style.update(style)
3979 self._applied_styles.append(applied_style)
3980 self._unclosed_elements.append(unclosed_elements)
3981
3982 def end(self, tag):
3983 if tag not in (_x('ttml:br'), 'br'):
3984 unclosed_elements = self._unclosed_elements.pop()
3985 for element in reversed(unclosed_elements):
3986 self._out += '</%s>' % element
3987 if unclosed_elements and self._applied_styles:
3988 self._applied_styles.pop()
3989
3990 def data(self, data):
3991 self._out += data
3992
3993 def close(self):
3994 return self._out.strip()
3995
3996 def parse_node(node):
3997 target = TTMLPElementParser()
3998 parser = xml.etree.ElementTree.XMLParser(target=target)
3999 parser.feed(xml.etree.ElementTree.tostring(node))
4000 return parser.close()
4001
4002 for k, v in LEGACY_NAMESPACES:
4003 for ns in v:
4004 dfxp_data = dfxp_data.replace(ns, k)
4005
4006 dfxp = compat_etree_fromstring(dfxp_data)
4007 out = []
4008 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
4009
4010 if not paras:
4011 raise ValueError('Invalid dfxp/TTML subtitle')
4012
4013 repeat = False
4014 while True:
4015 for style in dfxp.findall(_x('.//ttml:style')):
4016 style_id = style.get('id') or style.get(_x('xml:id'))
4017 if not style_id:
4018 continue
4019 parent_style_id = style.get('style')
4020 if parent_style_id:
4021 if parent_style_id not in styles:
4022 repeat = True
4023 continue
4024 styles[style_id] = styles[parent_style_id].copy()
4025 for prop in SUPPORTED_STYLING:
4026 prop_val = style.get(_x('tts:' + prop))
4027 if prop_val:
4028 styles.setdefault(style_id, {})[prop] = prop_val
4029 if repeat:
4030 repeat = False
4031 else:
4032 break
4033
4034 for p in ('body', 'div'):
4035 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
4036 if ele is None:
4037 continue
4038 style = styles.get(ele.get('style'))
4039 if not style:
4040 continue
4041 default_style.update(style)
4042
4043 for para, index in zip(paras, itertools.count(1)):
4044 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
4045 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
4046 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
4047 if begin_time is None:
4048 continue
4049 if not end_time:
4050 if not dur:
4051 continue
4052 end_time = begin_time + dur
4053 out.append('%d\n%s --> %s\n%s\n\n' % (
4054 index,
4055 srt_subtitles_timecode(begin_time),
4056 srt_subtitles_timecode(end_time),
4057 parse_node(para)))
4058
4059 return ''.join(out)
4060
4061
4062 def cli_option(params, command_option, param, separator=None):
4063 param = params.get(param)
4064 return ([] if param is None
4065 else [command_option, str(param)] if separator is None
4066 else [f'{command_option}{separator}{param}'])
4067
4068
4069 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4070 param = params.get(param)
4071 assert param in (True, False, None)
4072 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
4073
4074
4075 def cli_valueless_option(params, command_option, param, expected_value=True):
4076 return [command_option] if params.get(param) == expected_value else []
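
# For instance (illustrative parameter names):
#   >>> cli_option({'proxy': 'socks5://127.0.0.1:1080'}, '--proxy', 'proxy')
#   ['--proxy', 'socks5://127.0.0.1:1080']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   ['--no-check-certificate', 'true']
#   >>> cli_valueless_option({'verbose': True}, '-v', 'verbose')
#   ['-v']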
4077
4078
4079 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
4080 if isinstance(argdict, (list, tuple)): # for backward compatibility
4081 if use_compat:
4082 return argdict
4083 else:
4084 argdict = None
4085 if argdict is None:
4086 return default
4087 assert isinstance(argdict, dict)
4088
4089 assert isinstance(keys, (list, tuple))
4090 for key_list in keys:
4091 arg_list = list(filter(
4092 lambda x: x is not None,
4093 [argdict.get(key.lower()) for key in variadic(key_list)]))
4094 if arg_list:
4095 return [arg for args in arg_list for arg in args]
4096 return default
4097
4098
4099 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4100 main_key, exe = main_key.lower(), exe.lower()
4101 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4102 keys = [f'{root_key}{k}' for k in (keys or [''])]
4103 if root_key in keys:
4104 if main_key != exe:
4105 keys.append((main_key, exe))
4106 keys.append('default')
4107 else:
4108 use_compat = False
4109 return cli_configuration_args(argdict, keys, default, use_compat)
4110
4111
4112 class ISO639Utils:
4113 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4114 _lang_map = {
4115 'aa': 'aar',
4116 'ab': 'abk',
4117 'ae': 'ave',
4118 'af': 'afr',
4119 'ak': 'aka',
4120 'am': 'amh',
4121 'an': 'arg',
4122 'ar': 'ara',
4123 'as': 'asm',
4124 'av': 'ava',
4125 'ay': 'aym',
4126 'az': 'aze',
4127 'ba': 'bak',
4128 'be': 'bel',
4129 'bg': 'bul',
4130 'bh': 'bih',
4131 'bi': 'bis',
4132 'bm': 'bam',
4133 'bn': 'ben',
4134 'bo': 'bod',
4135 'br': 'bre',
4136 'bs': 'bos',
4137 'ca': 'cat',
4138 'ce': 'che',
4139 'ch': 'cha',
4140 'co': 'cos',
4141 'cr': 'cre',
4142 'cs': 'ces',
4143 'cu': 'chu',
4144 'cv': 'chv',
4145 'cy': 'cym',
4146 'da': 'dan',
4147 'de': 'deu',
4148 'dv': 'div',
4149 'dz': 'dzo',
4150 'ee': 'ewe',
4151 'el': 'ell',
4152 'en': 'eng',
4153 'eo': 'epo',
4154 'es': 'spa',
4155 'et': 'est',
4156 'eu': 'eus',
4157 'fa': 'fas',
4158 'ff': 'ful',
4159 'fi': 'fin',
4160 'fj': 'fij',
4161 'fo': 'fao',
4162 'fr': 'fra',
4163 'fy': 'fry',
4164 'ga': 'gle',
4165 'gd': 'gla',
4166 'gl': 'glg',
4167 'gn': 'grn',
4168 'gu': 'guj',
4169 'gv': 'glv',
4170 'ha': 'hau',
4171 'he': 'heb',
4172 'iw': 'heb', # Replaced by he in 1989 revision
4173 'hi': 'hin',
4174 'ho': 'hmo',
4175 'hr': 'hrv',
4176 'ht': 'hat',
4177 'hu': 'hun',
4178 'hy': 'hye',
4179 'hz': 'her',
4180 'ia': 'ina',
4181 'id': 'ind',
4182 'in': 'ind', # Replaced by id in 1989 revision
4183 'ie': 'ile',
4184 'ig': 'ibo',
4185 'ii': 'iii',
4186 'ik': 'ipk',
4187 'io': 'ido',
4188 'is': 'isl',
4189 'it': 'ita',
4190 'iu': 'iku',
4191 'ja': 'jpn',
4192 'jv': 'jav',
4193 'ka': 'kat',
4194 'kg': 'kon',
4195 'ki': 'kik',
4196 'kj': 'kua',
4197 'kk': 'kaz',
4198 'kl': 'kal',
4199 'km': 'khm',
4200 'kn': 'kan',
4201 'ko': 'kor',
4202 'kr': 'kau',
4203 'ks': 'kas',
4204 'ku': 'kur',
4205 'kv': 'kom',
4206 'kw': 'cor',
4207 'ky': 'kir',
4208 'la': 'lat',
4209 'lb': 'ltz',
4210 'lg': 'lug',
4211 'li': 'lim',
4212 'ln': 'lin',
4213 'lo': 'lao',
4214 'lt': 'lit',
4215 'lu': 'lub',
4216 'lv': 'lav',
4217 'mg': 'mlg',
4218 'mh': 'mah',
4219 'mi': 'mri',
4220 'mk': 'mkd',
4221 'ml': 'mal',
4222 'mn': 'mon',
4223 'mr': 'mar',
4224 'ms': 'msa',
4225 'mt': 'mlt',
4226 'my': 'mya',
4227 'na': 'nau',
4228 'nb': 'nob',
4229 'nd': 'nde',
4230 'ne': 'nep',
4231 'ng': 'ndo',
4232 'nl': 'nld',
4233 'nn': 'nno',
4234 'no': 'nor',
4235 'nr': 'nbl',
4236 'nv': 'nav',
4237 'ny': 'nya',
4238 'oc': 'oci',
4239 'oj': 'oji',
4240 'om': 'orm',
4241 'or': 'ori',
4242 'os': 'oss',
4243 'pa': 'pan',
4244 'pi': 'pli',
4245 'pl': 'pol',
4246 'ps': 'pus',
4247 'pt': 'por',
4248 'qu': 'que',
4249 'rm': 'roh',
4250 'rn': 'run',
4251 'ro': 'ron',
4252 'ru': 'rus',
4253 'rw': 'kin',
4254 'sa': 'san',
4255 'sc': 'srd',
4256 'sd': 'snd',
4257 'se': 'sme',
4258 'sg': 'sag',
4259 'si': 'sin',
4260 'sk': 'slk',
4261 'sl': 'slv',
4262 'sm': 'smo',
4263 'sn': 'sna',
4264 'so': 'som',
4265 'sq': 'sqi',
4266 'sr': 'srp',
4267 'ss': 'ssw',
4268 'st': 'sot',
4269 'su': 'sun',
4270 'sv': 'swe',
4271 'sw': 'swa',
4272 'ta': 'tam',
4273 'te': 'tel',
4274 'tg': 'tgk',
4275 'th': 'tha',
4276 'ti': 'tir',
4277 'tk': 'tuk',
4278 'tl': 'tgl',
4279 'tn': 'tsn',
4280 'to': 'ton',
4281 'tr': 'tur',
4282 'ts': 'tso',
4283 'tt': 'tat',
4284 'tw': 'twi',
4285 'ty': 'tah',
4286 'ug': 'uig',
4287 'uk': 'ukr',
4288 'ur': 'urd',
4289 'uz': 'uzb',
4290 've': 'ven',
4291 'vi': 'vie',
4292 'vo': 'vol',
4293 'wa': 'wln',
4294 'wo': 'wol',
4295 'xh': 'xho',
4296 'yi': 'yid',
4297 'ji': 'yid', # Replaced by yi in 1989 revision
4298 'yo': 'yor',
4299 'za': 'zha',
4300 'zh': 'zho',
4301 'zu': 'zul',
4302 }
4303
4304 @classmethod
4305 def short2long(cls, code):
4306 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4307 return cls._lang_map.get(code[:2])
4308
4309 @classmethod
4310 def long2short(cls, code):
4311 """Convert language code from ISO 639-2/T to ISO 639-1"""
4312 for short_name, long_name in cls._lang_map.items():
4313 if long_name == code:
4314 return short_name
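
# For instance:
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'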
4315
4316
4317 class ISO3166Utils:
4318 # From http://data.okfn.org/data/core/country-list
4319 _country_map = {
4320 'AF': 'Afghanistan',
4321 'AX': 'Åland Islands',
4322 'AL': 'Albania',
4323 'DZ': 'Algeria',
4324 'AS': 'American Samoa',
4325 'AD': 'Andorra',
4326 'AO': 'Angola',
4327 'AI': 'Anguilla',
4328 'AQ': 'Antarctica',
4329 'AG': 'Antigua and Barbuda',
4330 'AR': 'Argentina',
4331 'AM': 'Armenia',
4332 'AW': 'Aruba',
4333 'AU': 'Australia',
4334 'AT': 'Austria',
4335 'AZ': 'Azerbaijan',
4336 'BS': 'Bahamas',
4337 'BH': 'Bahrain',
4338 'BD': 'Bangladesh',
4339 'BB': 'Barbados',
4340 'BY': 'Belarus',
4341 'BE': 'Belgium',
4342 'BZ': 'Belize',
4343 'BJ': 'Benin',
4344 'BM': 'Bermuda',
4345 'BT': 'Bhutan',
4346 'BO': 'Bolivia, Plurinational State of',
4347 'BQ': 'Bonaire, Sint Eustatius and Saba',
4348 'BA': 'Bosnia and Herzegovina',
4349 'BW': 'Botswana',
4350 'BV': 'Bouvet Island',
4351 'BR': 'Brazil',
4352 'IO': 'British Indian Ocean Territory',
4353 'BN': 'Brunei Darussalam',
4354 'BG': 'Bulgaria',
4355 'BF': 'Burkina Faso',
4356 'BI': 'Burundi',
4357 'KH': 'Cambodia',
4358 'CM': 'Cameroon',
4359 'CA': 'Canada',
4360 'CV': 'Cape Verde',
4361 'KY': 'Cayman Islands',
4362 'CF': 'Central African Republic',
4363 'TD': 'Chad',
4364 'CL': 'Chile',
4365 'CN': 'China',
4366 'CX': 'Christmas Island',
4367 'CC': 'Cocos (Keeling) Islands',
4368 'CO': 'Colombia',
4369 'KM': 'Comoros',
4370 'CG': 'Congo',
4371 'CD': 'Congo, the Democratic Republic of the',
4372 'CK': 'Cook Islands',
4373 'CR': 'Costa Rica',
4374 'CI': 'Côte d\'Ivoire',
4375 'HR': 'Croatia',
4376 'CU': 'Cuba',
4377 'CW': 'Curaçao',
4378 'CY': 'Cyprus',
4379 'CZ': 'Czech Republic',
4380 'DK': 'Denmark',
4381 'DJ': 'Djibouti',
4382 'DM': 'Dominica',
4383 'DO': 'Dominican Republic',
4384 'EC': 'Ecuador',
4385 'EG': 'Egypt',
4386 'SV': 'El Salvador',
4387 'GQ': 'Equatorial Guinea',
4388 'ER': 'Eritrea',
4389 'EE': 'Estonia',
4390 'ET': 'Ethiopia',
4391 'FK': 'Falkland Islands (Malvinas)',
4392 'FO': 'Faroe Islands',
4393 'FJ': 'Fiji',
4394 'FI': 'Finland',
4395 'FR': 'France',
4396 'GF': 'French Guiana',
4397 'PF': 'French Polynesia',
4398 'TF': 'French Southern Territories',
4399 'GA': 'Gabon',
4400 'GM': 'Gambia',
4401 'GE': 'Georgia',
4402 'DE': 'Germany',
4403 'GH': 'Ghana',
4404 'GI': 'Gibraltar',
4405 'GR': 'Greece',
4406 'GL': 'Greenland',
4407 'GD': 'Grenada',
4408 'GP': 'Guadeloupe',
4409 'GU': 'Guam',
4410 'GT': 'Guatemala',
4411 'GG': 'Guernsey',
4412 'GN': 'Guinea',
4413 'GW': 'Guinea-Bissau',
4414 'GY': 'Guyana',
4415 'HT': 'Haiti',
4416 'HM': 'Heard Island and McDonald Islands',
4417 'VA': 'Holy See (Vatican City State)',
4418 'HN': 'Honduras',
4419 'HK': 'Hong Kong',
4420 'HU': 'Hungary',
4421 'IS': 'Iceland',
4422 'IN': 'India',
4423 'ID': 'Indonesia',
4424 'IR': 'Iran, Islamic Republic of',
4425 'IQ': 'Iraq',
4426 'IE': 'Ireland',
4427 'IM': 'Isle of Man',
4428 'IL': 'Israel',
4429 'IT': 'Italy',
4430 'JM': 'Jamaica',
4431 'JP': 'Japan',
4432 'JE': 'Jersey',
4433 'JO': 'Jordan',
4434 'KZ': 'Kazakhstan',
4435 'KE': 'Kenya',
4436 'KI': 'Kiribati',
4437 'KP': 'Korea, Democratic People\'s Republic of',
4438 'KR': 'Korea, Republic of',
4439 'KW': 'Kuwait',
4440 'KG': 'Kyrgyzstan',
4441 'LA': 'Lao People\'s Democratic Republic',
4442 'LV': 'Latvia',
4443 'LB': 'Lebanon',
4444 'LS': 'Lesotho',
4445 'LR': 'Liberia',
4446 'LY': 'Libya',
4447 'LI': 'Liechtenstein',
4448 'LT': 'Lithuania',
4449 'LU': 'Luxembourg',
4450 'MO': 'Macao',
4451 'MK': 'Macedonia, the Former Yugoslav Republic of',
4452 'MG': 'Madagascar',
4453 'MW': 'Malawi',
4454 'MY': 'Malaysia',
4455 'MV': 'Maldives',
4456 'ML': 'Mali',
4457 'MT': 'Malta',
4458 'MH': 'Marshall Islands',
4459 'MQ': 'Martinique',
4460 'MR': 'Mauritania',
4461 'MU': 'Mauritius',
4462 'YT': 'Mayotte',
4463 'MX': 'Mexico',
4464 'FM': 'Micronesia, Federated States of',
4465 'MD': 'Moldova, Republic of',
4466 'MC': 'Monaco',
4467 'MN': 'Mongolia',
4468 'ME': 'Montenegro',
4469 'MS': 'Montserrat',
4470 'MA': 'Morocco',
4471 'MZ': 'Mozambique',
4472 'MM': 'Myanmar',
4473 'NA': 'Namibia',
4474 'NR': 'Nauru',
4475 'NP': 'Nepal',
4476 'NL': 'Netherlands',
4477 'NC': 'New Caledonia',
4478 'NZ': 'New Zealand',
4479 'NI': 'Nicaragua',
4480 'NE': 'Niger',
4481 'NG': 'Nigeria',
4482 'NU': 'Niue',
4483 'NF': 'Norfolk Island',
4484 'MP': 'Northern Mariana Islands',
4485 'NO': 'Norway',
4486 'OM': 'Oman',
4487 'PK': 'Pakistan',
4488 'PW': 'Palau',
4489 'PS': 'Palestine, State of',
4490 'PA': 'Panama',
4491 'PG': 'Papua New Guinea',
4492 'PY': 'Paraguay',
4493 'PE': 'Peru',
4494 'PH': 'Philippines',
4495 'PN': 'Pitcairn',
4496 'PL': 'Poland',
4497 'PT': 'Portugal',
4498 'PR': 'Puerto Rico',
4499 'QA': 'Qatar',
4500 'RE': 'Réunion',
4501 'RO': 'Romania',
4502 'RU': 'Russian Federation',
4503 'RW': 'Rwanda',
4504 'BL': 'Saint Barthélemy',
4505 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4506 'KN': 'Saint Kitts and Nevis',
4507 'LC': 'Saint Lucia',
4508 'MF': 'Saint Martin (French part)',
4509 'PM': 'Saint Pierre and Miquelon',
4510 'VC': 'Saint Vincent and the Grenadines',
4511 'WS': 'Samoa',
4512 'SM': 'San Marino',
4513 'ST': 'Sao Tome and Principe',
4514 'SA': 'Saudi Arabia',
4515 'SN': 'Senegal',
4516 'RS': 'Serbia',
4517 'SC': 'Seychelles',
4518 'SL': 'Sierra Leone',
4519 'SG': 'Singapore',
4520 'SX': 'Sint Maarten (Dutch part)',
4521 'SK': 'Slovakia',
4522 'SI': 'Slovenia',
4523 'SB': 'Solomon Islands',
4524 'SO': 'Somalia',
4525 'ZA': 'South Africa',
4526 'GS': 'South Georgia and the South Sandwich Islands',
4527 'SS': 'South Sudan',
4528 'ES': 'Spain',
4529 'LK': 'Sri Lanka',
4530 'SD': 'Sudan',
4531 'SR': 'Suriname',
4532 'SJ': 'Svalbard and Jan Mayen',
4533 'SZ': 'Swaziland',
4534 'SE': 'Sweden',
4535 'CH': 'Switzerland',
4536 'SY': 'Syrian Arab Republic',
4537 'TW': 'Taiwan, Province of China',
4538 'TJ': 'Tajikistan',
4539 'TZ': 'Tanzania, United Republic of',
4540 'TH': 'Thailand',
4541 'TL': 'Timor-Leste',
4542 'TG': 'Togo',
4543 'TK': 'Tokelau',
4544 'TO': 'Tonga',
4545 'TT': 'Trinidad and Tobago',
4546 'TN': 'Tunisia',
4547 'TR': 'Turkey',
4548 'TM': 'Turkmenistan',
4549 'TC': 'Turks and Caicos Islands',
4550 'TV': 'Tuvalu',
4551 'UG': 'Uganda',
4552 'UA': 'Ukraine',
4553 'AE': 'United Arab Emirates',
4554 'GB': 'United Kingdom',
4555 'US': 'United States',
4556 'UM': 'United States Minor Outlying Islands',
4557 'UY': 'Uruguay',
4558 'UZ': 'Uzbekistan',
4559 'VU': 'Vanuatu',
4560 'VE': 'Venezuela, Bolivarian Republic of',
4561 'VN': 'Viet Nam',
4562 'VG': 'Virgin Islands, British',
4563 'VI': 'Virgin Islands, U.S.',
4564 'WF': 'Wallis and Futuna',
4565 'EH': 'Western Sahara',
4566 'YE': 'Yemen',
4567 'ZM': 'Zambia',
4568 'ZW': 'Zimbabwe',
4569 # Not ISO 3166 codes, but used for IP blocks
4570 'AP': 'Asia/Pacific Region',
4571 'EU': 'Europe',
4572 }
4573
4574 @classmethod
4575 def short2full(cls, code):
4576 """Convert an ISO 3166-2 country code to the corresponding full name"""
4577 return cls._country_map.get(code.upper())
4578
4579
4580 class GeoUtils:
4581 # Major IPv4 address blocks per country
4582 _country_ip_map = {
4583 'AD': '46.172.224.0/19',
4584 'AE': '94.200.0.0/13',
4585 'AF': '149.54.0.0/17',
4586 'AG': '209.59.64.0/18',
4587 'AI': '204.14.248.0/21',
4588 'AL': '46.99.0.0/16',
4589 'AM': '46.70.0.0/15',
4590 'AO': '105.168.0.0/13',
4591 'AP': '182.50.184.0/21',
4592 'AQ': '23.154.160.0/24',
4593 'AR': '181.0.0.0/12',
4594 'AS': '202.70.112.0/20',
4595 'AT': '77.116.0.0/14',
4596 'AU': '1.128.0.0/11',
4597 'AW': '181.41.0.0/18',
4598 'AX': '185.217.4.0/22',
4599 'AZ': '5.197.0.0/16',
4600 'BA': '31.176.128.0/17',
4601 'BB': '65.48.128.0/17',
4602 'BD': '114.130.0.0/16',
4603 'BE': '57.0.0.0/8',
4604 'BF': '102.178.0.0/15',
4605 'BG': '95.42.0.0/15',
4606 'BH': '37.131.0.0/17',
4607 'BI': '154.117.192.0/18',
4608 'BJ': '137.255.0.0/16',
4609 'BL': '185.212.72.0/23',
4610 'BM': '196.12.64.0/18',
4611 'BN': '156.31.0.0/16',
4612 'BO': '161.56.0.0/16',
4613 'BQ': '161.0.80.0/20',
4614 'BR': '191.128.0.0/12',
4615 'BS': '24.51.64.0/18',
4616 'BT': '119.2.96.0/19',
4617 'BW': '168.167.0.0/16',
4618 'BY': '178.120.0.0/13',
4619 'BZ': '179.42.192.0/18',
4620 'CA': '99.224.0.0/11',
4621 'CD': '41.243.0.0/16',
4622 'CF': '197.242.176.0/21',
4623 'CG': '160.113.0.0/16',
4624 'CH': '85.0.0.0/13',
4625 'CI': '102.136.0.0/14',
4626 'CK': '202.65.32.0/19',
4627 'CL': '152.172.0.0/14',
4628 'CM': '102.244.0.0/14',
4629 'CN': '36.128.0.0/10',
4630 'CO': '181.240.0.0/12',
4631 'CR': '201.192.0.0/12',
4632 'CU': '152.206.0.0/15',
4633 'CV': '165.90.96.0/19',
4634 'CW': '190.88.128.0/17',
4635 'CY': '31.153.0.0/16',
4636 'CZ': '88.100.0.0/14',
4637 'DE': '53.0.0.0/8',
4638 'DJ': '197.241.0.0/17',
4639 'DK': '87.48.0.0/12',
4640 'DM': '192.243.48.0/20',
4641 'DO': '152.166.0.0/15',
4642 'DZ': '41.96.0.0/12',
4643 'EC': '186.68.0.0/15',
4644 'EE': '90.190.0.0/15',
4645 'EG': '156.160.0.0/11',
4646 'ER': '196.200.96.0/20',
4647 'ES': '88.0.0.0/11',
4648 'ET': '196.188.0.0/14',
4649 'EU': '2.16.0.0/13',
4650 'FI': '91.152.0.0/13',
4651 'FJ': '144.120.0.0/16',
4652 'FK': '80.73.208.0/21',
4653 'FM': '119.252.112.0/20',
4654 'FO': '88.85.32.0/19',
4655 'FR': '90.0.0.0/9',
4656 'GA': '41.158.0.0/15',
4657 'GB': '25.0.0.0/8',
4658 'GD': '74.122.88.0/21',
4659 'GE': '31.146.0.0/16',
4660 'GF': '161.22.64.0/18',
4661 'GG': '62.68.160.0/19',
4662 'GH': '154.160.0.0/12',
4663 'GI': '95.164.0.0/16',
4664 'GL': '88.83.0.0/19',
4665 'GM': '160.182.0.0/15',
4666 'GN': '197.149.192.0/18',
4667 'GP': '104.250.0.0/19',
4668 'GQ': '105.235.224.0/20',
4669 'GR': '94.64.0.0/13',
4670 'GT': '168.234.0.0/16',
4671 'GU': '168.123.0.0/16',
4672 'GW': '197.214.80.0/20',
4673 'GY': '181.41.64.0/18',
4674 'HK': '113.252.0.0/14',
4675 'HN': '181.210.0.0/16',
4676 'HR': '93.136.0.0/13',
4677 'HT': '148.102.128.0/17',
4678 'HU': '84.0.0.0/14',
4679 'ID': '39.192.0.0/10',
4680 'IE': '87.32.0.0/12',
4681 'IL': '79.176.0.0/13',
4682 'IM': '5.62.80.0/20',
4683 'IN': '117.192.0.0/10',
4684 'IO': '203.83.48.0/21',
4685 'IQ': '37.236.0.0/14',
4686 'IR': '2.176.0.0/12',
4687 'IS': '82.221.0.0/16',
4688 'IT': '79.0.0.0/10',
4689 'JE': '87.244.64.0/18',
4690 'JM': '72.27.0.0/17',
4691 'JO': '176.29.0.0/16',
4692 'JP': '133.0.0.0/8',
4693 'KE': '105.48.0.0/12',
4694 'KG': '158.181.128.0/17',
4695 'KH': '36.37.128.0/17',
4696 'KI': '103.25.140.0/22',
4697 'KM': '197.255.224.0/20',
4698 'KN': '198.167.192.0/19',
4699 'KP': '175.45.176.0/22',
4700 'KR': '175.192.0.0/10',
4701 'KW': '37.36.0.0/14',
4702 'KY': '64.96.0.0/15',
4703 'KZ': '2.72.0.0/13',
4704 'LA': '115.84.64.0/18',
4705 'LB': '178.135.0.0/16',
4706 'LC': '24.92.144.0/20',
4707 'LI': '82.117.0.0/19',
4708 'LK': '112.134.0.0/15',
4709 'LR': '102.183.0.0/16',
4710 'LS': '129.232.0.0/17',
4711 'LT': '78.56.0.0/13',
4712 'LU': '188.42.0.0/16',
4713 'LV': '46.109.0.0/16',
4714 'LY': '41.252.0.0/14',
4715 'MA': '105.128.0.0/11',
4716 'MC': '88.209.64.0/18',
4717 'MD': '37.246.0.0/16',
4718 'ME': '178.175.0.0/17',
4719 'MF': '74.112.232.0/21',
4720 'MG': '154.126.0.0/17',
4721 'MH': '117.103.88.0/21',
4722 'MK': '77.28.0.0/15',
4723 'ML': '154.118.128.0/18',
4724 'MM': '37.111.0.0/17',
4725 'MN': '49.0.128.0/17',
4726 'MO': '60.246.0.0/16',
4727 'MP': '202.88.64.0/20',
4728 'MQ': '109.203.224.0/19',
4729 'MR': '41.188.64.0/18',
4730 'MS': '208.90.112.0/22',
4731 'MT': '46.11.0.0/16',
4732 'MU': '105.16.0.0/12',
4733 'MV': '27.114.128.0/18',
4734 'MW': '102.70.0.0/15',
4735 'MX': '187.192.0.0/11',
4736 'MY': '175.136.0.0/13',
4737 'MZ': '197.218.0.0/15',
4738 'NA': '41.182.0.0/16',
4739 'NC': '101.101.0.0/18',
4740 'NE': '197.214.0.0/18',
4741 'NF': '203.17.240.0/22',
4742 'NG': '105.112.0.0/12',
4743 'NI': '186.76.0.0/15',
4744 'NL': '145.96.0.0/11',
4745 'NO': '84.208.0.0/13',
4746 'NP': '36.252.0.0/15',
4747 'NR': '203.98.224.0/19',
4748 'NU': '49.156.48.0/22',
4749 'NZ': '49.224.0.0/14',
4750 'OM': '5.36.0.0/15',
4751 'PA': '186.72.0.0/15',
4752 'PE': '186.160.0.0/14',
4753 'PF': '123.50.64.0/18',
4754 'PG': '124.240.192.0/19',
4755 'PH': '49.144.0.0/13',
4756 'PK': '39.32.0.0/11',
4757 'PL': '83.0.0.0/11',
4758 'PM': '70.36.0.0/20',
4759 'PR': '66.50.0.0/16',
4760 'PS': '188.161.0.0/16',
4761 'PT': '85.240.0.0/13',
4762 'PW': '202.124.224.0/20',
4763 'PY': '181.120.0.0/14',
4764 'QA': '37.210.0.0/15',
4765 'RE': '102.35.0.0/16',
4766 'RO': '79.112.0.0/13',
4767 'RS': '93.86.0.0/15',
4768 'RU': '5.136.0.0/13',
4769 'RW': '41.186.0.0/16',
4770 'SA': '188.48.0.0/13',
4771 'SB': '202.1.160.0/19',
4772 'SC': '154.192.0.0/11',
4773 'SD': '102.120.0.0/13',
4774 'SE': '78.64.0.0/12',
4775 'SG': '8.128.0.0/10',
4776 'SI': '188.196.0.0/14',
4777 'SK': '78.98.0.0/15',
4778 'SL': '102.143.0.0/17',
4779 'SM': '89.186.32.0/19',
4780 'SN': '41.82.0.0/15',
4781 'SO': '154.115.192.0/18',
4782 'SR': '186.179.128.0/17',
4783 'SS': '105.235.208.0/21',
4784 'ST': '197.159.160.0/19',
4785 'SV': '168.243.0.0/16',
4786 'SX': '190.102.0.0/20',
4787 'SY': '5.0.0.0/16',
4788 'SZ': '41.84.224.0/19',
4789 'TC': '65.255.48.0/20',
4790 'TD': '154.68.128.0/19',
4791 'TG': '196.168.0.0/14',
4792 'TH': '171.96.0.0/13',
4793 'TJ': '85.9.128.0/18',
4794 'TK': '27.96.24.0/21',
4795 'TL': '180.189.160.0/20',
4796 'TM': '95.85.96.0/19',
4797 'TN': '197.0.0.0/11',
4798 'TO': '175.176.144.0/21',
4799 'TR': '78.160.0.0/11',
4800 'TT': '186.44.0.0/15',
4801 'TV': '202.2.96.0/19',
4802 'TW': '120.96.0.0/11',
4803 'TZ': '156.156.0.0/14',
4804 'UA': '37.52.0.0/14',
4805 'UG': '102.80.0.0/13',
4806 'US': '6.0.0.0/8',
4807 'UY': '167.56.0.0/13',
4808 'UZ': '84.54.64.0/18',
4809 'VA': '212.77.0.0/19',
4810 'VC': '207.191.240.0/21',
4811 'VE': '186.88.0.0/13',
4812 'VG': '66.81.192.0/20',
4813 'VI': '146.226.0.0/16',
4814 'VN': '14.160.0.0/11',
4815 'VU': '202.80.32.0/20',
4816 'WF': '117.20.32.0/21',
4817 'WS': '202.4.32.0/19',
4818 'YE': '134.35.0.0/16',
4819 'YT': '41.242.116.0/22',
4820 'ZA': '41.0.0.0/11',
4821 'ZM': '102.144.0.0/13',
4822 'ZW': '102.177.192.0/18',
4823 }
4824
4825 @classmethod
4826 def random_ipv4(cls, code_or_block):
4827 if len(code_or_block) == 2:
4828 block = cls._country_ip_map.get(code_or_block.upper())
4829 if not block:
4830 return None
4831 else:
4832 block = code_or_block
4833 addr, preflen = block.split('/')
4834 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
4835 addr_max = addr_min | (0xffffffff >> int(preflen))
4836 return str(socket.inet_ntoa(
4837 struct.pack('!L', random.randint(addr_min, addr_max))))
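
# The prefix length determines how many leading bits stay fixed; addr_max
# simply sets all remaining host bits. E.g. (illustrative) random_ipv4('US')
# draws uniformly from 6.0.0.0/8 per the table above, and
# random_ipv4('1.2.3.0/24') from 1.2.3.0 through 1.2.3.255.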
4838
4839
4840 class PerRequestProxyHandler(urllib.request.ProxyHandler):
4841 def __init__(self, proxies=None):
4842 # Set default handlers
4843 for type in ('http', 'https'):
4844 setattr(self, '%s_open' % type,
4845 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4846 meth(r, proxy, type))
4847 urllib.request.ProxyHandler.__init__(self, proxies)
4848
4849 def proxy_open(self, req, proxy, type):
4850 req_proxy = req.headers.get('Ytdl-request-proxy')
4851 if req_proxy is not None:
4852 proxy = req_proxy
4853 del req.headers['Ytdl-request-proxy']
4854
4855 if proxy == '__noproxy__':
4856 return None # No Proxy
4857 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4858 req.add_header('Ytdl-socks-proxy', proxy)
4859 # yt-dlp's http/https handlers do the actual wrapping of the socket with SOCKS
4860 return None
4861 return urllib.request.ProxyHandler.proxy_open(
4862 self, req, proxy, type)
4863
4864
4865 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4866 # released into Public Domain
4867 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4868
4869 def long_to_bytes(n, blocksize=0):
4870 """long_to_bytes(n:long, blocksize:int) : string
4871 Convert a long integer to a byte string.
4872
4873 If optional blocksize is given and greater than zero, pad the front of the
4874 byte string with binary zeros so that the length is a multiple of
4875 blocksize.
4876 """
4877 # after much testing, this algorithm was deemed to be the fastest
4878 s = b''
4879 n = int(n)
4880 while n > 0:
4881 s = struct.pack('>I', n & 0xffffffff) + s
4882 n = n >> 32
4883 # strip off leading zeros
4884 for i in range(len(s)):
4885 if s[i] != b'\000'[0]:
4886 break
4887 else:
4888 # only happens when n == 0
4889 s = b'\000'
4890 i = 0
4891 s = s[i:]
4892 # add back some pad bytes. this could be done more efficiently w.r.t. the
4893 # de-padding being done above, but sigh...
4894 if blocksize > 0 and len(s) % blocksize:
4895 s = (blocksize - len(s) % blocksize) * b'\000' + s
4896 return s
4897
4898
4899 def bytes_to_long(s):
4900 """bytes_to_long(string) : long
4901 Convert a byte string to a long integer.
4902
4903 This is (essentially) the inverse of long_to_bytes().
4904 """
4905 acc = 0
4906 length = len(s)
4907 if length % 4:
4908 extra = (4 - length % 4)
4909 s = b'\000' * extra + s
4910 length = length + extra
4911 for i in range(0, length, 4):
4912 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
4913 return acc
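
# For instance (big-endian round trip):
#   >>> long_to_bytes(256)
#   b'\x01\x00'
#   >>> bytes_to_long(b'\x01\x00')
#   256
#   >>> long_to_bytes(1, blocksize=4)
#   b'\x00\x00\x00\x01'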
4914
4915
4916 def ohdave_rsa_encrypt(data, exponent, modulus):
4917 '''
4918 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4919
4920 Input:
4921 data: data to encrypt, bytes-like object
4922 exponent, modulus: parameter e and N of RSA algorithm, both integer
4923 Output: hex string of encrypted data
4924
4925 Limitation: supports one block encryption only
4926 '''
4927
4928 payload = int(binascii.hexlify(data[::-1]), 16)
4929 encrypted = pow(payload, exponent, modulus)
4930 return '%x' % encrypted
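
# Toy example (illustrative only -- e=3, N=33 is far too small for real RSA).
# Note that the input bytes are reversed, i.e. treated as little-endian:
#   >>> ohdave_rsa_encrypt(b'\x02', 3, 33)   # pow(2, 3, 33) == 8
#   '8'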
4931
4932
4933 def pkcs1pad(data, length):
4934 """
4935 Padding input data with PKCS#1 scheme
4936
4937 @param {int[]} data input data
4938 @param {int} length target length
4939 @returns {int[]} padded data
4940 """
4941 if len(data) > length - 11:
4942 raise ValueError('Input data too long for PKCS#1 padding')
4943
4944 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding bytes must be nonzero per PKCS#1 v1.5
4945 return [0, 2] + pseudo_random + [0] + data
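
# For instance (illustrative; the middle padding bytes are random):
#   >>> padded = pkcs1pad([1, 2, 3], 16)
#   >>> len(padded), padded[:2], padded[-4:]
#   (16, [0, 2], [0, 1, 2, 3])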
4946
4947
4948 def _base_n_table(n, table):
4949 if not table and not n:
4950 raise ValueError('Either table or n must be specified')
4951 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4952
4953 if n and n != len(table):
4954 raise ValueError(f'base {n} exceeds table length {len(table)}')
4955 return table
4956
4957
4958 def encode_base_n(num, n=None, table=None):
4959 """Convert given int to a base-n string"""
4960 table = _base_n_table(n, table)
4961 if not num:
4962 return table[0]
4963
4964 result, base = '', len(table)
4965 while num:
4966 result = table[num % base] + result
4967 num = num // base
4968 return result
4969
4970
4971 def decode_base_n(string, n=None, table=None):
4972 """Convert given base-n string to int"""
4973 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4974 result, base = 0, len(table)
4975 for char in string:
4976 result = result * base + table[char]
4977 return result
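
# For instance:
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> decode_base_n('ff', 16)
#   255
#   >>> encode_base_n(5, table='01')   # a custom table implies the base
#   '101'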
4978
4979
4980 def decode_base(value, digits):
4981 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
4982 f'in a future version. Use {__name__}.decode_base_n instead')
4983 return decode_base_n(value, table=digits)
4984
4985
4986 def decode_packed_codes(code):
4987 mobj = re.search(PACKED_CODES_RE, code)
4988 obfuscated_code, base, count, symbols = mobj.groups()
4989 base = int(base)
4990 count = int(count)
4991 symbols = symbols.split('|')
4992 symbol_table = {}
4993
4994 while count:
4995 count -= 1
4996 base_n_count = encode_base_n(count, base)
4997 symbol_table[base_n_count] = symbols[count] or base_n_count
4998
4999 return re.sub(
5000 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
5001 obfuscated_code)
5002
5003
5004 def caesar(s, alphabet, shift):
5005 if shift == 0:
5006 return s
5007 l = len(alphabet)
5008 return ''.join(
5009 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
5010 for c in s)
5011
5012
5013 def rot47(s):
5014 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
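
# For instance:
#   >>> caesar('hello', 'abcdefghijklmnopqrstuvwxyz', 3)
#   'khoor'
#   >>> rot47(rot47('secret'))   # applying rot47 twice shifts by 94, a full cycle
#   'secret'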
5015
5016
5017 def parse_m3u8_attributes(attrib):
5018 info = {}
5019 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
5020 if val.startswith('"'):
5021 val = val[1:-1]
5022 info[key] = val
5023 return info
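
# For instance (illustrative attribute list; quoted values may contain commas):
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401f,mp4a.40.2"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}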
5024
5025
5026 def urshift(val, n):
5027 return val >> n if val >= 0 else (val + 0x100000000) >> n
5028
5029
5030 # Based on png2str() written by @gdkchan and improved by @yokrysty
5031 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5032 def decode_png(png_data):
5033 # Reference: https://www.w3.org/TR/PNG/
5034 header = png_data[8:]
5035
5036 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
5037 raise OSError('Not a valid PNG file.')
5038
5039 int_map = {1: '>B', 2: '>H', 4: '>I'}
5040 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
5041
5042 chunks = []
5043
5044 while header:
5045 length = unpack_integer(header[:4])
5046 header = header[4:]
5047
5048 chunk_type = header[:4]
5049 header = header[4:]
5050
5051 chunk_data = header[:length]
5052 header = header[length:]
5053
5054 header = header[4:] # Skip CRC
5055
5056 chunks.append({
5057 'type': chunk_type,
5058 'length': length,
5059 'data': chunk_data
5060 })
5061
5062 ihdr = chunks[0]['data']
5063
5064 width = unpack_integer(ihdr[:4])
5065 height = unpack_integer(ihdr[4:8])
5066
5067 idat = b''
5068
5069 for chunk in chunks:
5070 if chunk['type'] == b'IDAT':
5071 idat += chunk['data']
5072
5073 if not idat:
5074 raise OSError('Unable to read PNG data.')
5075
5076 decompressed_data = bytearray(zlib.decompress(idat))
5077
5078 stride = width * 3
5079 pixels = []
5080
5081 def _get_pixel(idx):
5082 x = idx % stride
5083 y = idx // stride
5084 return pixels[y][x]
5085
5086 for y in range(height):
5087 basePos = y * (1 + stride)
5088 filter_type = decompressed_data[basePos]
5089
5090 current_row = []
5091
5092 pixels.append(current_row)
5093
5094 for x in range(stride):
5095 color = decompressed_data[1 + basePos + x]
5096 basex = y * stride + x
5097 left = 0
5098 up = 0
5099
5100 if x > 2:
5101 left = _get_pixel(basex - 3)
5102 if y > 0:
5103 up = _get_pixel(basex - stride)
5104
5105 if filter_type == 1: # Sub
5106 color = (color + left) & 0xff
5107 elif filter_type == 2: # Up
5108 color = (color + up) & 0xff
5109 elif filter_type == 3: # Average
5110 color = (color + ((left + up) >> 1)) & 0xff
5111 elif filter_type == 4: # Paeth
5112 a = left
5113 b = up
5114 c = 0
5115
5116 if x > 2 and y > 0:
5117 c = _get_pixel(basex - stride - 3)
5118
5119 p = a + b - c
5120
5121 pa = abs(p - a)
5122 pb = abs(p - b)
5123 pc = abs(p - c)
5124
5125 if pa <= pb and pa <= pc:
5126 color = (color + a) & 0xff
5127 elif pb <= pc:
5128 color = (color + b) & 0xff
5129 else:
5130 color = (color + c) & 0xff
5131
5132 current_row.append(color)
5133
5134 return width, height, pixels
5135
5136
5137 def write_xattr(path, key, value):
5138 # Windows: Write xattrs to NTFS Alternate Data Streams:
5139 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5140 if compat_os_name == 'nt':
5141 assert ':' not in key
5142 assert os.path.exists(path)
5143
5144 try:
5145 with open(f'{path}:{key}', 'wb') as f:
5146 f.write(value)
5147 except OSError as e:
5148 raise XAttrMetadataError(e.errno, e.strerror)
5149 return
5150
5151 # UNIX Method 1. Use xattrs/pyxattrs modules
5152
5153 setxattr = None
5154 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5155 # Unicode arguments are not supported in pyxattr until version 0.5.0
5156 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5157 if version_tuple(xattr.__version__) >= (0, 5, 0):
5158 setxattr = xattr.set
5159 elif xattr:
5160 setxattr = xattr.setxattr
5161
5162 if setxattr:
5163 try:
5164 setxattr(path, key, value)
5165 except OSError as e:
5166 raise XAttrMetadataError(e.errno, e.strerror)
5167 return
5168
5169 # UNIX Method 2. Use setfattr/xattr executables
5170 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5171 else 'xattr' if check_executable('xattr', ['-h']) else None)
5172 if not exe:
5173 raise XAttrUnavailableError(
5174 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5175 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
5176
5177 value = value.decode()
5178 try:
5179 _, stderr, returncode = Popen.run(
5180 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
5181 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
5182 except OSError as e:
5183 raise XAttrMetadataError(e.errno, e.strerror)
5184 if returncode:
5185 raise XAttrMetadataError(returncode, stderr)
5186
5187
5188 def random_birthday(year_field, month_field, day_field):
5189 start_date = datetime.date(1950, 1, 1)
5190 end_date = datetime.date(1995, 12, 31)
5191 offset = random.randint(0, (end_date - start_date).days)
5192 random_date = start_date + datetime.timedelta(offset)
5193 return {
5194 year_field: str(random_date.year),
5195 month_field: str(random_date.month),
5196 day_field: str(random_date.day),
5197 }
5198
5199
5200 # Templates for internet shortcut files, which are plain text files.
5201 DOT_URL_LINK_TEMPLATE = '''\
5202 [InternetShortcut]
5203 URL=%(url)s
5204 '''
5205
5206 DOT_WEBLOC_LINK_TEMPLATE = '''\
5207 <?xml version="1.0" encoding="UTF-8"?>
5208 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5209 <plist version="1.0">
5210 <dict>
5211 \t<key>URL</key>
5212 \t<string>%(url)s</string>
5213 </dict>
5214 </plist>
5215 '''
5216
5217 DOT_DESKTOP_LINK_TEMPLATE = '''\
5218 [Desktop Entry]
5219 Encoding=UTF-8
5220 Name=%(filename)s
5221 Type=Link
5222 URL=%(url)s
5223 Icon=text-html
5224 '''
5225
5226 LINK_TEMPLATES = {
5227 'url': DOT_URL_LINK_TEMPLATE,
5228 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5229 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5230 }
5231
5232
5233 def iri_to_uri(iri):
5234 """
5235 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5236
5237 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5238 """
5239
5240 iri_parts = urllib.parse.urlparse(iri)
5241
5242 if '[' in iri_parts.netloc:
5243 raise ValueError('IPv6 URIs are not yet supported.')
5244 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5245
5246 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5247
5248 net_location = ''
5249 if iri_parts.username:
5250 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
5251 if iri_parts.password is not None:
5252 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
5253 net_location += '@'
5254
5255 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
5256 # The 'idna' encoding produces ASCII text.
5257 if iri_parts.port is not None and iri_parts.port != 80:
5258 net_location += ':' + str(iri_parts.port)
5259
5260 return urllib.parse.urlunparse(
5261 (iri_parts.scheme,
5262 net_location,
5263
5264 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5265
5266 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5267 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5268
5269 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5270 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5271
5272 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5273
5274 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
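
# For instance (illustrative):
#   >>> iri_to_uri('http://example.com/päth?q=wört')
#   'http://example.com/p%C3%A4th?q=w%C3%B6rt'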
5275
5276
5277 def to_high_limit_path(path):
5278 if sys.platform in ['win32', 'cygwin']:
5279 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5280 return '\\\\?\\' + os.path.abspath(path)
5281
5282 return path
5283
5284
5285 def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
5286 val = traverse_obj(obj, *variadic(field))
5287 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
5288 return default
5289 return template % func(val)
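
# For instance (illustrative):
#   >>> format_field({'height': 1080}, 'height', '%sp')
#   '1080p'
#   >>> format_field({}, 'height', '%sp', default='unknown')
#   'unknown'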
5290
5291
5292 def clean_podcast_url(url):
5293 return re.sub(r'''(?x)
5294 (?:
5295 (?:
5296 chtbl\.com/track|
5297 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5298 play\.podtrac\.com
5299 )/[^/]+|
5300 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5301 flex\.acast\.com|
5302 pd(?:
5303 cn\.co| # https://podcorn.com/analytics-prefix/
5304 st\.fm # https://podsights.com/docs/
5305 )/e
5306 )/''', '', url)
5307
5308
5309 _HEX_TABLE = '0123456789abcdef'
5310
5311
5312 def random_uuidv4():
5313 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5314
5315
5316 def make_dir(path, to_screen=None):
5317 try:
5318 dn = os.path.dirname(path)
5319 if dn and not os.path.exists(dn):
5320 os.makedirs(dn)
5321 return True
5322 except OSError as err:
5323 if callable(to_screen):  # callable() returns a bool, so the old "is not None" comparison was always true
5324 to_screen('unable to create directory ' + error_to_compat_str(err))
5325 return False
5326
5327
5328 def get_executable_path():
5329 from .update import _get_variant_and_executable_path
5330
5331 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
5332
5333
5334 def load_plugins(name, suffix, namespace):
5335 classes = {}
5336 with contextlib.suppress(FileNotFoundError):
5337 plugins_spec = importlib.util.spec_from_file_location(
5338 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5339 plugins = importlib.util.module_from_spec(plugins_spec)
5340 sys.modules[plugins_spec.name] = plugins
5341 plugins_spec.loader.exec_module(plugins)
5342 for name in dir(plugins):
5343 if name in namespace:
5344 continue
5345 if not name.endswith(suffix):
5346 continue
5347 klass = getattr(plugins, name)
5348 classes[name] = namespace[name] = klass
5349 return classes
5350
5351
5352 def traverse_obj(
5353 obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True,
5354 casesense=True, is_user_input=False, traverse_string=False):
5355 """
5356 Safely traverse nested `dict`s and `Sequence`s
5357
5358 >>> obj = [{}, {"key": "value"}]
5359 >>> traverse_obj(obj, (1, "key"))
5360 "value"
5361
5362 Each of the provided `paths` is tested and the first producing a valid result will be returned.
5363 The next path will also be tested if the path branched but no results could be found.
5364 Supported values for traversal are `Mapping`, `Sequence` and `re.Match`.
5365 A value of None is treated as the absence of a value.
5366
5367 The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`.
5368
5369 The keys in the path can be one of:
5370 - `None`: Return the current object.
5371 - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`.
5372 - `slice`: Branch out and return all values in `obj[key]`.
5373 - `Ellipsis`: Branch out and return a list of all values.
5374 - `tuple`/`list`: Branch out and return a list of all matching values.
5375 Read as: `[traverse_obj(obj, branch) for branch in branches]`.
5376 - `function`: Branch out and return values filtered by the function.
5377 Read as: `[value for key, value in obj if function(key, value)]`.
5378 For `Sequence`s, `key` is the index of the value.
5379 - `dict` Transform the current object and return a matching dict.
5380 Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`.
5381
5382 `tuple`, `list`, and `dict` all support nested paths and branches.
5383
5384 @params paths Paths which to traverse by.
5385 @param default Value to return if the paths do not match.
5386 @param expected_type If a `type`, only accept final values of this type.
5387 If any other callable, try to call the function on each result.
5388 @param get_all If `False`, return the first matching result, otherwise all matching ones.
5389 @param casesense If `False`, consider string dictionary keys as case insensitive.
5390
5391 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5392
5393 @param is_user_input Whether the keys are generated from user input.
5394 If `True` strings get converted to `int`/`slice` if needed.
5395 @param traverse_string Whether to traverse into objects as strings.
5396 If `True`, any non-compatible object will first be
5397 converted into a string and then traversed into.
5398
5399
5400 @returns The result of the object traversal.
5401 If successful, `get_all=True`, and the path branches at least once,
5402 then a list of results is returned instead.
5403 A list is always returned if the last path branches and no `default` is given.
5404 """
5405 is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes))
5406 casefold = lambda k: k.casefold() if isinstance(k, str) else k
5407
5408 if isinstance(expected_type, type):
5409 type_test = lambda val: val if isinstance(val, expected_type) else None
5410 else:
5411 type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,))
5412
5413 def apply_key(key, obj):
5414 if obj is None:
5415 return
5416
5417 elif key is None:
5418 yield obj
5419
5420 elif isinstance(key, (list, tuple)):
5421 for branch in key:
5422 _, result = apply_path(obj, branch)
5423 yield from result
5424
5425 elif key is ...:
5426 if isinstance(obj, collections.abc.Mapping):
5427 yield from obj.values()
5428 elif is_sequence(obj):
5429 yield from obj
5430 elif isinstance(obj, re.Match):
5431 yield from obj.groups()
5432 elif traverse_string:
5433 yield from str(obj)
5434
5435 elif callable(key):
5436 if is_sequence(obj):
5437 iter_obj = enumerate(obj)
5438 elif isinstance(obj, collections.abc.Mapping):
5439 iter_obj = obj.items()
5440 elif isinstance(obj, re.Match):
5441 iter_obj = enumerate((obj.group(), *obj.groups()))
5442 elif traverse_string:
5443 iter_obj = enumerate(str(obj))
5444 else:
5445 return
5446 yield from (v for k, v in iter_obj if try_call(key, args=(k, v)))
5447
5448 elif isinstance(key, dict):
5449 iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items())
5450 yield {k: v if v is not None else default for k, v in iter_obj
5451 if v is not None or default is not NO_DEFAULT}
5452
5453 elif isinstance(obj, collections.abc.Mapping):
5454 yield (obj.get(key) if casesense or (key in obj)
5455 else next((v for k, v in obj.items() if casefold(k) == key), None))
5456
5457 elif isinstance(obj, re.Match):
5458 if isinstance(key, int) or casesense:
5459 with contextlib.suppress(IndexError):
5460 yield obj.group(key)
5461 return
5462
5463 if not isinstance(key, str):
5464 return
5465
5466 yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None)
5467
5468 else:
5469 if is_user_input:
5470 key = (int_or_none(key) if ':' not in key
5471 else slice(*map(int_or_none, key.split(':'))))
5472
5473 if not isinstance(key, (int, slice)):
5474 return
5475
5476 if not is_sequence(obj):
5477 if not traverse_string:
5478 return
5479 obj = str(obj)
5480
5481 with contextlib.suppress(IndexError):
5482 yield obj[key]
5483
5484 def apply_path(start_obj, path):
5485 objs = (start_obj,)
5486 has_branched = False
5487
5488 for key in variadic(path):
5489 if is_user_input and key == ':':
5490 key = ...
5491
5492 if not casesense and isinstance(key, str):
5493 key = key.casefold()
5494
5495 if key is ... or isinstance(key, (list, tuple)) or callable(key):
5496 has_branched = True
5497
5498 key_func = functools.partial(apply_key, key)
5499 objs = itertools.chain.from_iterable(map(key_func, objs))
5500
5501 return has_branched, objs
5502
5503 def _traverse_obj(obj, path, use_list=True):
5504 has_branched, results = apply_path(obj, path)
5505 results = LazyList(x for x in map(type_test, results) if x is not None)
5506
5507 if get_all and has_branched:
5508 return results.exhaust() if results or use_list else None
5509
5510 return results[0] if results else None
5511
5512 for index, path in enumerate(paths, 1):
5513 use_list = default is NO_DEFAULT and index == len(paths)
5514 result = _traverse_obj(obj, path, use_list)
5515 if result is not None:
5516 return result
5517
5518 return None if default is NO_DEFAULT else default
5519
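# Editor's sketch (illustrative only, never called): how the path features
# documented above compose. The `info` dict and its values are made up.
def _traverse_obj_examples():
    info = {'formats': [{'url': 'https://cdn.example/a', 'height': 720},
                        {'url': 'https://cdn.example/b'}]}
    # plain keys index step by step
    assert traverse_obj(info, ('formats', 0, 'url')) == 'https://cdn.example/a'
    # `...` branches over all items; branches missing the key are dropped
    assert traverse_obj(info, ('formats', ..., 'height')) == [720]
    # a function key keeps the (key, value) pairs for which it returns true
    assert traverse_obj(info, ('formats', 0, lambda _, v: v == 720)) == [720]
    # a dict key builds a new dict, one sub-path per entry
    assert traverse_obj(info, {'u': ('formats', 1, 'url')}) == {'u': 'https://cdn.example/b'}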
5520
5521 def traverse_dict(dictn, keys, casesense=True):
5522 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5523 f'in a future version. Use "{__name__}.traverse_obj" instead')
5524 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5525
5526
5527 def get_first(obj, keys, **kwargs):
5528 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5529
5530
5531 def time_seconds(**kwargs):
5532 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5533 return t.timestamp()
5534
5535
5536 # Create a JSON Web Signature (JWS) with the HS256 algorithm;
5537 # the resulting format is JWS Compact Serialization
5538 # Implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5539 # Implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5540 def jwt_encode_hs256(payload_data, key, headers=None):
5541 header_data = {
5542 'alg': 'HS256',
5543 'typ': 'JWT',
5544 }
5545 if headers:
5546 header_data.update(headers)
5547 # RFC 7515 §2 mandates the unpadded base64url alphabet for all three segments
5548 b64url = lambda x: base64.urlsafe_b64encode(x).rstrip(b'=')
5549 header_b64 = b64url(json.dumps(header_data).encode())
5550 payload_b64 = b64url(json.dumps(payload_data).encode())
5551 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
5552 return header_b64 + b'.' + payload_b64 + b'.' + b64url(h.digest())
5553
5554
5555 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5556 def jwt_decode_hs256(jwt):
5557 header_b64, payload_b64, signature_b64 = jwt.split('.')
5558 # Add back trailing '='s that may have been stripped; superfluous '='s are ignored
5559 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
5560 return payload_data
5561
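# Editor's sketch (hypothetical key and claims): round-tripping a token
# through the two helpers above.
def _jwt_round_trip_example():
    token = jwt_encode_hs256({'sub': 'user', 'exp': 1700000000}, 'secret-key')
    assert token.count(b'.') == 2  # header.payload.signature
    assert jwt_decode_hs256(token.decode())['sub'] == 'user'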
5562
5563 WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5564
5565
5566 @functools.cache
5567 def supports_terminal_sequences(stream):
5568 if compat_os_name == 'nt':
5569 if not WINDOWS_VT_MODE:
5570 return False
5571 elif not os.getenv('TERM'):
5572 return False
5573 try:
5574 return stream.isatty()
5575 except BaseException:
5576 return False
5577
5578
5579 def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
5580 if get_windows_version() < (10, 0, 10586):
5581 return
5582 global WINDOWS_VT_MODE
5583 try:
5584 Popen.run('', shell=True)
5585 except Exception:
5586 return
5587
5588 WINDOWS_VT_MODE = True
5589 supports_terminal_sequences.cache_clear()
5590
5591
5592 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5593
5594
5595 def remove_terminal_sequences(string):
5596 return _terminal_sequences_re.sub('', string)
5597
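# Editor's sketch: stripping SGR color codes from a styled log line.
def _remove_terminal_sequences_example():
    assert remove_terminal_sequences('\033[0;31mERROR:\033[0m oops') == 'ERROR: oops'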
5598
5599 def number_of_digits(number):
5600 return len('%d' % number)
5601
5602
5603 def join_nonempty(*values, delim='-', from_dict=None):
5604 if from_dict is not None:
5605 values = (traverse_obj(from_dict, variadic(v)) for v in values)
5606 return delim.join(map(str, filter(None, values)))
5607
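# Editor's sketch: falsy parts are dropped; `from_dict` first resolves each
# value through `traverse_obj`.
def _join_nonempty_examples():
    assert join_nonempty('mp4', None, 1080, '', delim='-') == 'mp4-1080'
    assert join_nonempty('title', 'id', from_dict={'id': 'x1', 'title': 'clip'}) == 'clip-x1'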
5608
5609 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5610 """
5611 Find the largest format dimensions in terms of video width and, for each thumbnail:
5612 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5613 * Update dimensions
5614
5615 This function is useful with video services that scale the provided thumbnails on demand
5616 """
5617 _keys = ('width', 'height')
5618 max_dimensions = max(
5619 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5620 default=(0, 0))
5621 if not max_dimensions[0]:
5622 return thumbnails
5623 return [
5624 merge_dicts(
5625 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5626 dict(zip(_keys, max_dimensions)), thumbnail)
5627 for thumbnail in thumbnails
5628 ]
5629
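# Editor's sketch (made-up URLs): every thumbnail is upgraded to the widest
# format, assuming `url_width_re` matches the width embedded in the URL.
def _scale_thumbnails_example():
    formats = [{'width': 640, 'height': 360}, {'width': 1920, 'height': 1080}]
    thumbnails = [{'url': 'https://cdn.example/thumb-320.jpg'}]
    assert scale_thumbnails_to_max_format_width(formats, thumbnails, r'\d+') == [
        {'url': 'https://cdn.example/thumb-1920.jpg', 'width': 1920, 'height': 1080}]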
5630
5631 def parse_http_range(range):
5632 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5633 if not range:
5634 return None, None, None
5635 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5636 if not crg:
5637 return None, None, None
5638 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5639
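# Editor's sketch: both the request "Range" and response "Content-Range" forms.
def _parse_http_range_examples():
    assert parse_http_range('bytes=0-499') == (0, 499, None)
    assert parse_http_range('bytes 500-999/1234') == (500, 999, 1234)
    assert parse_http_range(None) == (None, None, None)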
5640
5641 def read_stdin(what):
5642 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5643 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5644 return sys.stdin
5645
5646
5647 def determine_file_encoding(data):
5648 """
5649 Detect the text encoding used
5650 @returns (encoding, bytes to skip)
5651 """
5652
5653 # BOMs are given priority over in-band coding declarations
5654 for bom, enc in BOMS:
5655 if data.startswith(bom):
5656 return enc, len(bom)
5657
5658 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5659 # We ignore the endianness to get a good enough match
5660 data = data.replace(b'\0', b'')
5661 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5662 return mobj.group(1).decode() if mobj else None, 0
5663
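# Editor's sketch (sample buffers are made up), assuming the module-level
# BOMS table maps b'\xff\xfe' to 'utf-16-le':
def _determine_file_encoding_examples():
    assert determine_file_encoding(b'\xff\xfeh\x00i\x00') == ('utf-16-le', 2)
    assert determine_file_encoding(b'# coding: utf-8\n--flag') == ('utf-8', 0)
    assert determine_file_encoding(b'--flag value') == (None, 0)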
5664
5665 class Config:
5666 own_args = None
5667 parsed_args = None
5668 filename = None
5669 __initialized = False
5670
5671 def __init__(self, parser, label=None):
5672 self.parser, self.label = parser, label
5673 self._loaded_paths, self.configs = set(), []
5674
5675 def init(self, args=None, filename=None):
5676 assert not self.__initialized
5677 self.own_args, self.filename = args, filename
5678 return self.load_configs()
5679
5680 def load_configs(self):
5681 directory = ''
5682 if self.filename:
5683 location = os.path.realpath(self.filename)
5684 directory = os.path.dirname(location)
5685 if location in self._loaded_paths:
5686 return False
5687 self._loaded_paths.add(location)
5688
5689 self.__initialized = True
5690 opts, _ = self.parser.parse_known_args(self.own_args)
5691 self.parsed_args = self.own_args
5692 for location in opts.config_locations or []:
5693 if location == '-':
5694 if location in self._loaded_paths:
5695 continue
5696 self._loaded_paths.add(location)
5697 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5698 continue
5699 location = os.path.join(directory, expand_path(location))
5700 if os.path.isdir(location):
5701 location = os.path.join(location, 'yt-dlp.conf')
5702 if not os.path.exists(location):
5703 self.parser.error(f'config location {location} does not exist')
5704 self.append_config(self.read_file(location), location)
5705 return True
5706
5707 def __str__(self):
5708 label = join_nonempty(
5709 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5710 delim=' ')
5711 return join_nonempty(
5712 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5713 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5714 delim='\n')
5715
5716 @staticmethod
5717 def read_file(filename, default=[]):
5718 try:
5719 optionf = open(filename, 'rb')
5720 except OSError:
5721 return default # silently skip if file is not present
5722 try:
5723 enc, skip = determine_file_encoding(optionf.read(512))
5724 optionf.seek(skip, io.SEEK_SET)
5725 except OSError:
5726 enc = None # silently skip read errors
5727 try:
5728 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5729 contents = optionf.read().decode(enc or preferredencoding())
5730 res = shlex.split(contents, comments=True)
5731 except Exception as err:
5732 raise ValueError(f'Unable to parse "{filename}": {err}')
5733 finally:
5734 optionf.close()
5735 return res
5736
5737 @staticmethod
5738 def hide_login_info(opts):
5739 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5740 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5741
5742 def _scrub_eq(o):
5743 m = eqre.match(o)
5744 if m:
5745 return m.group('key') + '=PRIVATE'
5746 else:
5747 return o
5748
5749 opts = list(map(_scrub_eq, opts))
5750 for idx, opt in enumerate(opts):
5751 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5752 opts[idx + 1] = 'PRIVATE'
5753 return opts
5754
5755 def append_config(self, *args, label=None):
5756 config = type(self)(self.parser, label)
5757 config._loaded_paths = self._loaded_paths
5758 if config.init(*args):
5759 self.configs.append(config)
5760
5761 @property
5762 def all_args(self):
5763 for config in reversed(self.configs):
5764 yield from config.all_args
5765 yield from self.parsed_args or []
5766
5767 def parse_known_args(self, **kwargs):
5768 return self.parser.parse_known_args(self.all_args, **kwargs)
5769
5770 def parse_args(self):
5771 return self.parser.parse_args(self.all_args)
5772
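# Editor's sketch: `Config.hide_login_info` scrubs credentials in both the
# '--opt value' and '--opt=value' spellings (the argv below is made up).
def _hide_login_info_example():
    assert Config.hide_login_info(['-u', 'me', '--password=hunter2', '-v']) \
        == ['-u', 'PRIVATE', '--password=PRIVATE', '-v']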
5773
5774 class WebSocketsWrapper:
5775 """Wraps websockets module to use in non-async scopes"""
5776 pool = None
5777
5778 def __init__(self, url, headers=None, connect=True):
5779 self.loop = asyncio.new_event_loop()
5780 # XXX: "loop" is deprecated
5781 self.conn = websockets.connect(
5782 url, extra_headers=headers, ping_interval=None,
5783 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5784 if connect:
5785 self.__enter__()
5786 atexit.register(self.__exit__, None, None, None)
5787
5788 def __enter__(self):
5789 if not self.pool:
5790 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5791 return self
5792
5793 def send(self, *args):
5794 self.run_with_loop(self.pool.send(*args), self.loop)
5795
5796 def recv(self, *args):
5797 return self.run_with_loop(self.pool.recv(*args), self.loop)
5798
5799 def __exit__(self, type, value, traceback):
5800 try:
5801 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5802 finally:
5803 self._cancel_all_tasks(self.loop)  # must run while the loop is still open
5804 self.loop.close()
5805
5806 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5807 # for contributors: If any new library that uses asyncio needs to be run in non-async scopes, move these functions out of this class
5808 @staticmethod
5809 def run_with_loop(main, loop):
5810 if not asyncio.iscoroutine(main):
5811 raise ValueError(f'a coroutine was expected, got {main!r}')
5812
5813 try:
5814 return loop.run_until_complete(main)
5815 finally:
5816 loop.run_until_complete(loop.shutdown_asyncgens())
5817 if hasattr(loop, 'shutdown_default_executor'):
5818 loop.run_until_complete(loop.shutdown_default_executor())
5819
5820 @staticmethod
5821 def _cancel_all_tasks(loop):
5822 to_cancel = asyncio.all_tasks(loop)
5823
5824 if not to_cancel:
5825 return
5826
5827 for task in to_cancel:
5828 task.cancel()
5829
5830 # The "loop" argument was removed in Python 3.10+; gather() infers it from the tasks
5831 loop.run_until_complete(
5832 asyncio.gather(*to_cancel, return_exceptions=True))
5833
5834 for task in to_cancel:
5835 if task.cancelled():
5836 continue
5837 if task.exception() is not None:
5838 loop.call_exception_handler({
5839 'message': 'unhandled exception during asyncio.run() shutdown',
5840 'exception': task.exception(),
5841 'task': task,
5842 })
5843
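# Editor's sketch (not executed: it needs a live endpoint; the URL is made
# up): typical synchronous use of WebSocketsWrapper.
# ws = WebSocketsWrapper('wss://live.example.com/socket', headers={'Origin': 'https://example.com'})
# ws.send('{"op": "subscribe"}')
# message = ws.recv()
# ws.__exit__(None, None, None)  # closes the connection and the event loop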
5844
5845 def merge_headers(*dicts):
5846 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5847 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5848
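# Editor's sketch: keys are normalized with str.title(), so later dicts win
# regardless of the casing used.
def _merge_headers_example():
    assert merge_headers({'user-agent': 'A', 'Accept': '*/*'}, {'USER-AGENT': 'B'}) \
        == {'User-Agent': 'B', 'Accept': '*/*'}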
5849
5850 def cached_method(f):
5851 """Cache a method"""
5852 signature = inspect.signature(f)
5853
5854 @functools.wraps(f)
5855 def wrapper(self, *args, **kwargs):
5856 bound_args = signature.bind(self, *args, **kwargs)
5857 bound_args.apply_defaults()
5858 key = tuple(bound_args.arguments.values())[1:]
5859
5860 cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {})
5861 if key not in cache:
5862 cache[key] = f(self, *args, **kwargs)
5863 return cache[key]
5864 return wrapper
5865
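# Editor's sketch: results are memoized per instance, keyed on the bound,
# defaulted arguments (excluding `self`).
def _cached_method_demo():
    class Demo:
        calls = 0

        @cached_method
        def double(self, x):
            self.calls += 1
            return x * 2

    demo = Demo()
    assert demo.double(21) == demo.double(21) == 42
    assert demo.calls == 1  # the second call was served from the cache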
5866
5867 class classproperty:
5868 """property access for class methods with optional caching"""
5869 def __new__(cls, func=None, *args, **kwargs):
5870 if not func:
5871 return functools.partial(cls, *args, **kwargs)
5872 return super().__new__(cls)
5873
5874 def __init__(self, func, *, cache=False):
5875 functools.update_wrapper(self, func)
5876 self.func = func
5877 self._cache = {} if cache else None
5878
5879 def __get__(self, _, cls):
5880 if self._cache is None:
5881 return self.func(cls)
5882 elif cls not in self._cache:
5883 self._cache[cls] = self.func(cls)
5884 return self._cache[cls]
5885
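# Editor's sketch: the value is computed on the class, not an instance; with
# `cache=True`, each class (including subclasses) gets its own cached value.
def _classproperty_demo():
    class Base:
        @classproperty(cache=True)
        def tag(cls):
            return cls.__name__.lower()

    class Child(Base):
        pass

    assert (Base.tag, Child.tag) == ('base', 'child')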
5886
5887 class Namespace(types.SimpleNamespace):
5888 """Immutable namespace"""
5889
5890 def __iter__(self):
5891 return iter(self.__dict__.values())
5892
5893 @property
5894 def items_(self):
5895 return self.__dict__.items()
5896
5897
5898 MEDIA_EXTENSIONS = Namespace(
5899 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5900 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5901 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5902 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5903 thumbnails=('jpg', 'png', 'webp'),
5904 storyboards=('mhtml', ),
5905 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5906 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5907 )
5908 MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5909 MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5910
5911 KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5912
5913
5914 class RetryManager:
5915 """Usage:
5916 for retry in RetryManager(...):
5917 try:
5918 ...
5919 except SomeException as err:
5920 retry.error = err
5921 continue
5922 """
5923 attempt, _error = 0, None
5924
5925 def __init__(self, _retries, _error_callback, **kwargs):
5926 self.retries = _retries or 0
5927 self.error_callback = functools.partial(_error_callback, **kwargs)
5928
5929 def _should_retry(self):
5930 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5931
5932 @property
5933 def error(self):
5934 if self._error is NO_DEFAULT:
5935 return None
5936 return self._error
5937
5938 @error.setter
5939 def error(self, value):
5940 self._error = value
5941
5942 def __iter__(self):
5943 while self._should_retry():
5944 self.error = NO_DEFAULT
5945 self.attempt += 1
5946 yield self
5947 if self.error:
5948 self.error_callback(self.error, self.attempt, self.retries)
5949
5950 @staticmethod
5951 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5952 """Utility function for reporting retries"""
5953 if count > retries:
5954 if error:
5955 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5956 raise e
5957
5958 if not count:
5959 return warn(e)
5960 elif isinstance(e, ExtractorError):
5961 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
5962 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5963
5964 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5965 if delay:
5966 info(f'Sleeping {delay:.2f} seconds ...')
5967 time.sleep(delay)
5968
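# Editor's sketch: a complete retry loop. The callback runs after every failed
# attempt; like `report_retry` above, it should only raise once retries are spent.
def _retry_manager_demo():
    attempts = []

    def _callback(err, count, retries):
        if count > retries:
            raise err

    for retry in RetryManager(3, _callback):
        try:
            attempts.append(retry.attempt)
            if retry.attempt < 2:
                raise OSError('transient failure')
        except OSError as err:
            retry.error = err
            continue
    assert attempts == [1, 2]  # succeeded on the second of up to 4 attempts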
5969
5970 def make_archive_id(ie, video_id):
5971 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5972 return f'{ie_key.lower()} {video_id}'
5973
5974
5975 def truncate_string(s, left, right=0):
5976 assert left > 3 and right >= 0
5977 if s is None or len(s) <= left + right:
5978 return s
5979 return f'{s[:left - 3]}...{s[-right:] if right else ""}'  # s[-0:] would be the whole string
5980
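# Editor's sketch: keeps `left - 3` head characters plus `right` tail
# characters around the ellipsis.
def _truncate_string_examples():
    assert truncate_string('abcdefghij', 6) == 'abc...'
    assert truncate_string('abcdefghij', 6, 2) == 'abc...ij'
    assert truncate_string('short', 10) == 'short'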
5981
5982 def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5983 assert 'all' in alias_dict, '"all" alias is required'
5984 requested = list(start or [])
5985 for val in options:
5986 discard = val.startswith('-')
5987 if discard:
5988 val = val[1:]
5989
5990 if val in alias_dict:
5991 val = alias_dict[val] if not discard else [
5992 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5993 # NB: Do not allow regex in aliases for performance
5994 requested = orderedSet_from_options(val, alias_dict, start=requested)
5995 continue
5996
5997 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5998 else [val] if val in alias_dict['all'] else None)
5999 if current is None:
6000 raise ValueError(val)
6001
6002 if discard:
6003 for item in current:
6004 while item in requested:
6005 requested.remove(item)
6006 else:
6007 requested.extend(current)
6008
6009 return orderedSet(requested)
6010
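# Editor's sketch (toy alias table; 'all' is the required catch-all alias):
# aliases expand recursively, and a '-' prefix discards matching entries.
def _ordered_set_from_options_demo():
    aliases = {'all': ['thumbnail', 'subs', 'chapters'], 'meta': ['thumbnail', 'chapters']}
    assert orderedSet_from_options(['meta', '-chapters', 'subs'], aliases) == ['thumbnail', 'subs']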
6011
6012 class FormatSorter:
6013 regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'
6014
6015 default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
6016 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
6017 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases
6018 ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
6019 'height', 'width', 'proto', 'vext', 'abr', 'aext',
6020 'fps', 'fs_approx', 'source', 'id')
6021
6022 settings = {
6023 'vcodec': {'type': 'ordered', 'regex': True,
6024 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
6025 'acodec': {'type': 'ordered', 'regex': True,
6026 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
6027 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
6028 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
6029 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
6030 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']},
6031 'vext': {'type': 'ordered', 'field': 'video_ext',
6032 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'),
6033 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')},
6034 'aext': {'type': 'ordered', 'field': 'audio_ext',
6035 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'),
6036 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')},
6037 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
6038 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple',
6039 'field': ('vcodec', 'acodec'),
6040 'function': lambda it: int(any(v != 'none' for v in it))},
6041 'ie_pref': {'priority': True, 'type': 'extractor'},
6042 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
6043 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
6044 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
6045 'quality': {'convert': 'float', 'default': -1},
6046 'filesize': {'convert': 'bytes'},
6047 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
6048 'id': {'convert': 'string', 'field': 'format_id'},
6049 'height': {'convert': 'float_none'},
6050 'width': {'convert': 'float_none'},
6051 'fps': {'convert': 'float_none'},
6052 'channels': {'convert': 'float_none', 'field': 'audio_channels'},
6053 'tbr': {'convert': 'float_none'},
6054 'vbr': {'convert': 'float_none'},
6055 'abr': {'convert': 'float_none'},
6056 'asr': {'convert': 'float_none'},
6057 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
6058
6059 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
6060 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
6061 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
6062 'ext': {'type': 'combined', 'field': ('vext', 'aext')},
6063 'res': {'type': 'multiple', 'field': ('height', 'width'),
6064 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
6065
6066 # Actual field names
6067 'format_id': {'type': 'alias', 'field': 'id'},
6068 'preference': {'type': 'alias', 'field': 'ie_pref'},
6069 'language_preference': {'type': 'alias', 'field': 'lang'},
6070 'source_preference': {'type': 'alias', 'field': 'source'},
6071 'protocol': {'type': 'alias', 'field': 'proto'},
6072 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'},
6073 'audio_channels': {'type': 'alias', 'field': 'channels'},
6074
6075 # Deprecated
6076 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True},
6077 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True},
6078 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True},
6079 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True},
6080 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True},
6081 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True},
6082 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True},
6083 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True},
6084 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True},
6085 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True},
6086 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True},
6087 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True},
6088 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True},
6089 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True},
6090 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6091 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True},
6092 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6093 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True},
6094 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6095 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True},
6096 }
6097
6098 def __init__(self, ydl, field_preference):
6099 self.ydl = ydl
6100 self._order = []
6101 self.evaluate_params(self.ydl.params, field_preference)
6102 if ydl.params.get('verbose'):
6103 self.print_verbose_info(self.ydl.write_debug)
6104
6105 def _get_field_setting(self, field, key):
6106 if field not in self.settings:
6107 if key in ('forced', 'priority'):
6108 return False
6109 self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is '
6110 'deprecated and may be removed in a future version')
6111 self.settings[field] = {}
6112 propObj = self.settings[field]
6113 if key not in propObj:
6114 type = propObj.get('type')
6115 if key == 'field':
6116 default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field
6117 elif key == 'convert':
6118 default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore'
6119 else:
6120 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None)
6121 propObj[key] = default
6122 return propObj[key]
6123
6124 def _resolve_field_value(self, field, value, convertNone=False):
6125 if value is None:
6126 if not convertNone:
6127 return None
6128 else:
6129 value = value.lower()
6130 conversion = self._get_field_setting(field, 'convert')
6131 if conversion == 'ignore':
6132 return None
6133 if conversion == 'string':
6134 return value
6135 elif conversion == 'float_none':
6136 return float_or_none(value)
6137 elif conversion == 'bytes':
6138 return parse_bytes(value)
6139 elif conversion == 'order':
6140 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order')
6141 use_regex = self._get_field_setting(field, 'regex')
6142 list_length = len(order_list)
6143 empty_pos = order_list.index('') if '' in order_list else list_length + 1
6144 if use_regex and value is not None:
6145 for i, regex in enumerate(order_list):
6146 if regex and re.match(regex, value):
6147 return list_length - i
6148 return list_length - empty_pos # not in list
6149 else: # not regex, or value is None
6150 return list_length - (order_list.index(value) if value in order_list else empty_pos)
6151 else:
6152 if value.isnumeric():
6153 return float(value)
6154 else:
6155 self.settings[field]['convert'] = 'string'
6156 return value
6157
6158 def evaluate_params(self, params, sort_extractor):
6159 self._use_free_order = params.get('prefer_free_formats', False)
6160 self._sort_user = params.get('format_sort', [])
6161 self._sort_extractor = sort_extractor
6162
6163 def add_item(field, reverse, closest, limit_text):
6164 field = field.lower()
6165 if field in self._order:
6166 return
6167 self._order.append(field)
6168 limit = self._resolve_field_value(field, limit_text)
6169 data = {
6170 'reverse': reverse,
6171 'closest': False if limit is None else closest,
6172 'limit_text': limit_text,
6173 'limit': limit}
6174 if field in self.settings:
6175 self.settings[field].update(data)
6176 else:
6177 self.settings[field] = data
6178
6179 sort_list = (
6180 tuple(field for field in self.default if self._get_field_setting(field, 'forced'))
6181 + (tuple() if params.get('format_sort_force', False)
6182 else tuple(field for field in self.default if self._get_field_setting(field, 'priority')))
6183 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
6184
6185 for item in sort_list:
6186 match = re.match(self.regex, item)
6187 if match is None:
6188 raise ExtractorError('Invalid format sort string "%s" given by extractor' % item)
6189 field = match.group('field')
6190 if field is None:
6191 continue
6192 if self._get_field_setting(field, 'type') == 'alias':
6193 alias, field = field, self._get_field_setting(field, 'field')
6194 if self._get_field_setting(alias, 'deprecated'):
6195 self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may '
6196 f'be removed in a future version. Please use {field} instead')
6197 reverse = match.group('reverse') is not None
6198 closest = match.group('separator') == '~'
6199 limit_text = match.group('limit')
6200
6201 has_limit = limit_text is not None
6202 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined'
6203 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit')
6204
6205 fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,)
6206 limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple()
6207 limit_count = len(limits)
6208 for (i, f) in enumerate(fields):
6209 add_item(f, reverse, closest,
6210 limits[i] if i < limit_count
6211 else limits[0] if has_limit and not has_multiple_limits
6212 else None)
6213
6214 def print_verbose_info(self, write_debug):
6215 if self._sort_user:
6216 write_debug('Sort order given by user: %s' % ', '.join(self._sort_user))
6217 if self._sort_extractor:
6218 write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor))
6219 write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % (
6220 '+' if self._get_field_setting(field, 'reverse') else '', field,
6221 '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':',
6222 self._get_field_setting(field, 'limit_text'),
6223 self._get_field_setting(field, 'limit'))
6224 if self._get_field_setting(field, 'limit_text') is not None else '')
6225 for field in self._order if self._get_field_setting(field, 'visible')]))
6226
6227 def _calculate_field_preference_from_value(self, format, field, type, value):
6228 reverse = self._get_field_setting(field, 'reverse')
6229 closest = self._get_field_setting(field, 'closest')
6230 limit = self._get_field_setting(field, 'limit')
6231
6232 if type == 'extractor':
6233 maximum = self._get_field_setting(field, 'max')
6234 if value is None or (maximum is not None and value >= maximum):
6235 value = -1
6236 elif type == 'boolean':
6237 in_list = self._get_field_setting(field, 'in_list')
6238 not_in_list = self._get_field_setting(field, 'not_in_list')
6239 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
6240 elif type == 'ordered':
6241 value = self._resolve_field_value(field, value, True)
6242
6243 # try to convert to number
6244 val_num = float_or_none(value, default=self._get_field_setting(field, 'default'))
6245 is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None
6246 if is_num:
6247 value = val_num
6248
6249 return ((-10, 0) if value is None
6250 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
6251 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
6252 else (0, value, 0) if not reverse and (limit is None or value <= limit)
6253 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
6254 else (-1, value, 0))
6255
6256 def _calculate_field_preference(self, format, field):
6257 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
6258 get_value = lambda f: format.get(self._get_field_setting(f, 'field'))
6259 if type == 'multiple':
6260 type = 'field' # Only 'field' is allowed in multiple for now
6261 actual_fields = self._get_field_setting(field, 'field')
6262
6263 value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields)
6264 else:
6265 value = get_value(field)
6266 return self._calculate_field_preference_from_value(format, field, type, value)
6267
6268 def calculate_preference(self, format):
6269 # Determine missing protocol
6270 if not format.get('protocol'):
6271 format['protocol'] = determine_protocol(format)
6272
6273 # Determine missing ext
6274 if not format.get('ext') and 'url' in format:
6275 format['ext'] = determine_ext(format['url'])
6276 if format.get('vcodec') == 'none':
6277 format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none'
6278 format['video_ext'] = 'none'
6279 else:
6280 format['video_ext'] = format['ext']
6281 format['audio_ext'] = 'none'
6282 # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported?
6283 # format['preference'] = -1000
6284
6285 # Determine missing bitrates
6286 if format.get('tbr') is None:
6287 if format.get('vbr') is not None and format.get('abr') is not None:
6288 format['tbr'] = format.get('vbr', 0) + format.get('abr', 0)
6289 else:
6290 if format.get('vcodec') != 'none' and format.get('vbr') is None:
6291 format['vbr'] = format.get('tbr') - format.get('abr', 0)
6292 if format.get('acodec') != 'none' and format.get('abr') is None:
6293 format['abr'] = format.get('tbr') - format.get('vbr', 0)
6294
6295 return tuple(self._calculate_field_preference(format, field) for field in self._order)
6296
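# Editor's sketch: how `FormatSorter.regex` decomposes one sort token (e.g.
# from `--format-sort`): '+' reverses, ':' gives a limit, '~' means "closest".
def _format_sort_token_demo():
    m = re.match(FormatSorter.regex, '+res:720')
    assert (m.group('reverse'), m.group('field'), m.group('separator'), m.group('limit')) \
        == ('+', 'res', ':', '720')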
6297
6298 # Deprecated
6299 has_certifi = bool(certifi)
6300 has_websockets = bool(websockets)