]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
[extractor/youtube] Support changing extraction language (#4470)
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
6929b41a 1import asyncio
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
62e609ab 8import contextlib
c496ca96 9import datetime
0c265486 10import email.header
f8271158 11import email.utils
f45c185f 12import errno
d77c3dfd 13import gzip
49fa4d9a
N
14import hashlib
15import hmac
ac668111 16import html.entities
17import html.parser
54007a45 18import http.client
19import http.cookiejar
019a94f7 20import importlib.util
b1f94422 21import inspect
03f9daab 22import io
79a2e94e 23import itertools
f4bfd65f 24import json
d77c3dfd 25import locale
02dbf93f 26import math
f8271158 27import mimetypes
347de493 28import operator
d77c3dfd 29import os
c496ca96 30import platform
773f291d 31import random
d77c3dfd 32import re
f8271158 33import shlex
c496ca96 34import socket
79a2e94e 35import ssl
ac668111 36import struct
1c088fa8 37import subprocess
d77c3dfd 38import sys
181c8655 39import tempfile
c380cc28 40import time
01951dda 41import traceback
64fa820c 42import types
989a01c2 43import unicodedata
14f25df2 44import urllib.error
f8271158 45import urllib.parse
ac668111 46import urllib.request
bcf89ce6 47import xml.etree.ElementTree
d77c3dfd 48import zlib
d77c3dfd 49
6929b41a 50from .compat import functools # isort: split
8c25f81b 51from .compat import (
36e6f62c 52 compat_etree_fromstring,
51098426 53 compat_expanduser,
f8271158 54 compat_HTMLParseError,
efa97bdc 55 compat_os_name,
702ccf2d 56 compat_shlex_quote,
8c25f81b 57)
ac668111 58from .dependencies import brotli, certifi, websockets, xattr
f8271158 59from .socks import ProxyType, sockssocket
71aff188 60
4644ac55 61
51fb4995
YCH
def register_socks_protocols():
    """Teach urllib that SOCKS URL schemes carry a network location.

    urlsplit() only parses the host portion for schemes listed in
    urllib.parse.uses_netloc (https://bugs.python.org/issue7904), so each
    SOCKS variant must be registered there to be handled correctly.
    """
    registered = urllib.parse.uses_netloc
    for proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proxy_scheme not in registered:
            registered.append(proxy_scheme)
51fb4995
YCH
69
70
468e2e92
FV
71# This is not clearly defined otherwise
72compiled_regex_type = type(re.compile(''))
73
f7a147e3
S
74
def random_user_agent():
    """Return a desktop-Chrome User-Agent string with a randomly chosen version."""
    chrome_versions = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    version = random.choice(chrome_versions)
    return ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            f'(KHTML, like Gecko) Chrome/{version} Safari/537.36')
118
119
4390d5ec 120SUPPORTED_ENCODINGS = [
121 'gzip', 'deflate'
122]
9b8ee23b 123if brotli:
4390d5ec 124 SUPPORTED_ENCODINGS.append('br')
125
3e669f36 126std_headers = {
f7a147e3 127 'User-Agent': random_user_agent(),
59ae15a5 128 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59ae15a5 129 'Accept-Language': 'en-us,en;q=0.5',
b1156c1e 130 'Sec-Fetch-Mode': 'navigate',
3e669f36 131}
f427df17 132
5f6a1245 133
fb37eb25
S
134USER_AGENTS = {
135 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
136}
137
138
bf42a990 139NO_DEFAULT = object()
7b2c3f47 140IDENTITY = lambda x: x
bf42a990 141
7105440c
YCH
142ENGLISH_MONTH_NAMES = [
143 'January', 'February', 'March', 'April', 'May', 'June',
144 'July', 'August', 'September', 'October', 'November', 'December']
145
f6717dec
S
146MONTH_NAMES = {
147 'en': ENGLISH_MONTH_NAMES,
148 'fr': [
3e4185c3
S
149 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
150 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 151}
a942d6cb 152
8f53dc44 153# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
154TIMEZONE_NAMES = {
155 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
156 'AST': -4, 'ADT': -3, # Atlantic (used in Canada)
157 'EST': -5, 'EDT': -4, # Eastern
158 'CST': -6, 'CDT': -5, # Central
159 'MST': -7, 'MDT': -6, # Mountain
160 'PST': -8, 'PDT': -7 # Pacific
161}
162
c587cbb7 163# needed for sanitizing filenames in restricted mode
c8827027 164ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
fd35d8cd
JW
165 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
166 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 167
46f59e89
S
168DATE_FORMATS = (
169 '%d %B %Y',
170 '%d %b %Y',
171 '%B %d %Y',
cb655f34
S
172 '%B %dst %Y',
173 '%B %dnd %Y',
9d30c213 174 '%B %drd %Y',
cb655f34 175 '%B %dth %Y',
46f59e89 176 '%b %d %Y',
cb655f34
S
177 '%b %dst %Y',
178 '%b %dnd %Y',
9d30c213 179 '%b %drd %Y',
cb655f34 180 '%b %dth %Y',
46f59e89
S
181 '%b %dst %Y %I:%M',
182 '%b %dnd %Y %I:%M',
9d30c213 183 '%b %drd %Y %I:%M',
46f59e89
S
184 '%b %dth %Y %I:%M',
185 '%Y %m %d',
186 '%Y-%m-%d',
bccdbd22 187 '%Y.%m.%d.',
46f59e89 188 '%Y/%m/%d',
81c13222 189 '%Y/%m/%d %H:%M',
46f59e89 190 '%Y/%m/%d %H:%M:%S',
1931a55e
THD
191 '%Y%m%d%H%M',
192 '%Y%m%d%H%M%S',
4f3fa23e 193 '%Y%m%d',
0c1c6f4b 194 '%Y-%m-%d %H:%M',
46f59e89
S
195 '%Y-%m-%d %H:%M:%S',
196 '%Y-%m-%d %H:%M:%S.%f',
5014558a 197 '%Y-%m-%d %H:%M:%S:%f',
46f59e89
S
198 '%d.%m.%Y %H:%M',
199 '%d.%m.%Y %H.%M',
200 '%Y-%m-%dT%H:%M:%SZ',
201 '%Y-%m-%dT%H:%M:%S.%fZ',
202 '%Y-%m-%dT%H:%M:%S.%f0Z',
203 '%Y-%m-%dT%H:%M:%S',
204 '%Y-%m-%dT%H:%M:%S.%f',
205 '%Y-%m-%dT%H:%M',
c6eed6b8
S
206 '%b %d %Y at %H:%M',
207 '%b %d %Y at %H:%M:%S',
b555ae9b
S
208 '%B %d %Y at %H:%M',
209 '%B %d %Y at %H:%M:%S',
a63d9bd0 210 '%H:%M %d-%b-%Y',
46f59e89
S
211)
212
213DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
214DATE_FORMATS_DAY_FIRST.extend([
215 '%d-%m-%Y',
216 '%d.%m.%Y',
217 '%d.%m.%y',
218 '%d/%m/%Y',
219 '%d/%m/%y',
220 '%d/%m/%Y %H:%M:%S',
47304e07 221 '%d-%m-%Y %H:%M',
46f59e89
S
222])
223
224DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
225DATE_FORMATS_MONTH_FIRST.extend([
226 '%m-%d-%Y',
227 '%m.%d.%Y',
228 '%m/%d/%Y',
229 '%m/%d/%y',
230 '%m/%d/%Y %H:%M:%S',
231])
232
06b3fe29 233PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
ae61d108 234JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'
06b3fe29 235
1d485a1a 236NUMBER_RE = r'\d+(?:\.\d+)?'
237
7105440c 238
@functools.cache
def preferredencoding():
    """Return the system's preferred text encoding (cached).

    Falls back to 'UTF-8' when locale.getpreferredencoding() fails or
    reports a codec that cannot actually encode text.
    """
    try:
        encoding = locale.getpreferredencoding()
        'TEST'.encode(encoding)  # smoke-test that the codec is usable
    except Exception:
        return 'UTF-8'
    return encoding
d77c3dfd 253
f4bfd65f 254
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # Write to a sibling temp file first so the final os.rename is atomic
    # (same directory => same filesystem).
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile is created 0600; re-apply the default 0666
            # minus the process umask so the result looks like a normal file.
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file before re-raising
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
279
280
def find_xpath_attr(node, xpath, key, val=None):
    """Find the first element matching xpath[@key] (or xpath[@key='val'])."""
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
59ae56fa 286
d7e66d39
JMF
287# On python2.6 the xml.etree.ElementTree.Element methods don't support
288# the namespace parameter
5f6a1245
JW
289
290
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' (Clark) notation.

    @param ns_map  mapping of namespace prefix -> namespace URI
    """
    expanded = []
    for step in path.split('/'):
        parts = step.split(':')
        if len(parts) == 1:
            expanded.append(parts[0])
        else:
            prefix, tag = parts
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
301
d77c3dfd 302
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find an element by xpath (or by the first matching xpath of several).

    @param xpath    a single XPath string or an iterable of candidates
    @param name     human-readable name used in the error message
    @param fatal    raise ExtractorError instead of returning None on failure
    @param default  value returned when nothing matches (overrides fatal)
    """
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        # Try candidates in order; keep the first match
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
324
325
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content.

    Missing element or missing text is handled via default/fatal exactly
    as in xpath_element().
    """
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text
a41fb80c
S
339
340
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the first element matching xpath[@key].

    Missing match is handled via default/fatal as in xpath_element().
    """
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
bf0ff932
PH
352
353
# NOTE: the parameter is named `id` (shadowing the builtin) for historical
# API compatibility.
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)
6f32a0b5
ZM
362
363
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the specified attribute=value, or None"""
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the html of the first tag with the specified attribute=value, or None"""
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None
384
385
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # The regex matches class_name as a whole word inside the class attribute
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
408
409
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document.

    Yields (content, whole_element) pairs for every match.
    @param escape_value  when false, `value` is treated as a regex
    """

    # Allow an unquoted attribute value only when it contains no characters
    # that HTML requires to be quoted
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
a921f407 433
c5229f39 434
class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        # Stack of currently-open tag names; exposed to callers
        self.tagstack = collections.deque()
        super().__init__()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until the matching opening tag; unmatched closers are an error
        while self.tagstack:
            if self.tagstack.pop() == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
475
476
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Prime the parser with the opening tag only
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed up to each candidate closing tag until the parser signals that
        # it matched the original opening tag (handles nested same-name tags)
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
510
511
class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser recording the attributes of the last start tag seen"""

    def __init__(self):
        super().__init__()
        # Attribute dict of the most recently parsed start tag
        self.attrs = {}

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
521
c5229f39 522
class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser collecting the attributes of top-level <li> elements"""

    def __init__(self):
        super().__init__()
        self.items = []   # one attribute dict per top-level <li>
        self._level = 0   # current tag nesting depth

    def handle_starttag(self, tag, attrs):
        # Only record <li> tags that appear at the outermost level
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
538
539
8bb56eee
BF
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    # Best-effort: malformed markup yields whatever was parsed so far
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
9e6dd238 559
c5229f39 560
73673ccf
FF
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of their attribute dictionaries"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
568
569
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Collapse whitespace, then turn <br> and paragraph breaks into newlines
    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
9e6dd238
FV
584
585
class LenientJSONDecoder(json.JSONDecoder):
    """JSONDecoder that can pre-process its input and tolerate trailing garbage.

    @param transform_source  optional callable applied to the string before decoding
    @param ignore_extra      when true, decode only the first JSON value and
                             ignore anything that follows it
    """

    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source = transform_source
        self.ignore_extra = ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if not self.ignore_extra:
            return super().decode(s)
        obj, _ = self.raw_decode(s.lstrip())
        return obj
597
598
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # Second attempt retries with a sanitized filename
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking unavailable or failed: fall back to a plain open
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # Permission errors are not fixable by renaming; give up immediately
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise
d77c3dfd
FV
636
637
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a Unix timestamp (None if unparseable)."""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 645
5f6a1245 646
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # NUL bytes ('\0') mark substitute characters so they can be
        # deduplicated/stripped later, then removed.
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    if restricted and is_id is NO_DEFAULT:
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
        result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
d77c3dfd 699
5f6a1245 700
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows.

    On other platforms the path is returned unchanged unless force is true,
    in which case Windows-style sanitization is applied anyway.
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters invalid in Windows path components (and trailing
    # spaces/dots) with '#', but keep '.'/'..' navigation parts intact
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
722
723
def sanitize_url(url, *, scheme='http'):
    """Normalize a URL: give protocol-relative URLs a scheme and fix common typos.

    @param scheme  scheme prepended to '//host/...' URLs (default 'http')
    """
    if url is None:
        return None
    if url.startswith('//'):
        # Prepend protocol-less URLs with `http:` scheme in order to mitigate
        # the number of unwanted failures due to missing protocol
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    typo_fixups = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in typo_fixups:
        fixed, count = re.subn(mistake, fixup, url)
        if count:
            return fixed
    return url
17bcc626
S
742
743
def extract_basic_auth(url):
    """Split userinfo credentials out of a URL.

    Returns (url_without_credentials, basic_auth_header_value); the header
    value is None when the URL carries no username.
    """
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    netloc = parts.hostname
    if parts.port is not None:
        netloc = '%s:%d' % (parts.hostname, parts.port)
    stripped_url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))
    credentials = '%s:%s' % (parts.username, parts.password or '')
    token = base64.b64encode(credentials.encode()).decode()
    return stripped_url, f'Basic {token}'
5435dcf9
HH
754
755
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request from a sanitized and escaped URL, moving any
    embedded userinfo credentials into a Basic Authorization header."""
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # urllib.request.Request(url, data, headers): headers may arrive
        # positionally or as a keyword
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)
67dda517
S
762
763
51098426
S
def expand_path(s):
    """Expand shell variables and ~ in the given path string"""
    return os.path.expandvars(compat_expanduser(s))
767
768
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable, keeping first-seen order.

    @param lazy  when true, return a lazy generator instead of a list

    Hashable items are tracked in a set for O(1) membership checks;
    unhashable items (e.g. dicts/lists) fall back to a linear scan,
    preserving the original behavior for them.
    """
    def _iter():
        seen_hashable = set()  # fast path for hashable items
        seen_unhashable = []   # fallback for unhashable items
        for x in iterable:
            try:
                if x in seen_hashable:
                    continue
                seen_hashable.add(x)
            except TypeError:  # unhashable
                if x in seen_unhashable:
                    continue
                seen_unhashable.append(x)
            yield x

    return _iter() if lazy else list(_iter())
d77c3dfd 779
912b38b4 780
55b2f099 781def _htmlentity_transform(entity_with_semicolon):
4e408e47 782 """Transforms an HTML entity to a character."""
55b2f099
YCH
783 entity = entity_with_semicolon[:-1]
784
4e408e47 785 # Known non-numeric HTML entity
ac668111 786 if entity in html.entities.name2codepoint:
787 return chr(html.entities.name2codepoint[entity])
4e408e47 788
62b58c09
L
789 # TODO: HTML5 allows entities without a semicolon.
790 # E.g. '&Eacuteric' should be decoded as 'Éric'.
ac668111 791 if entity_with_semicolon in html.entities.html5:
792 return html.entities.html5[entity_with_semicolon]
55b2f099 793
91757b0f 794 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
795 if mobj is not None:
796 numstr = mobj.group(1)
28e614de 797 if numstr.startswith('x'):
4e408e47 798 base = 16
28e614de 799 numstr = '0%s' % numstr
4e408e47
PH
800 else:
801 base = 10
067aa17e 802 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 803 with contextlib.suppress(ValueError):
ac668111 804 return chr(int(numstr, base))
4e408e47
PH
805
806 # Unknown entity in name, return its literal representation
7a3f0c00 807 return '&%s;' % entity
4e408e47
PH
808
809
def unescapeHTML(s):
    """Replace HTML entities in *s* with their characters (None passes through)."""
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 817
8bf48f23 818
def escapeHTML(text):
    """Escape the five HTML-special characters (&, <, >, ", ') in *text*."""
    # Single-pass translation; equivalent to chained .replace() with '&' first
    return text.translate(str.maketrans({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    }))
828
829
def process_communicate_or_kill(p, *args, **kwargs):
    """Deprecated shim: forwards to Popen.communicate_or_kill with a warning."""
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)
f5b1bca9 834
835
class Popen(subprocess.Popen):
    """subprocess.Popen wrapper: hides console windows on Windows, restores
    the loader path under PyInstaller, and adds communicate_or_kill()/run()."""

    if sys.platform == 'win32':
        # Prevent a console window from flashing up for spawned processes
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
        Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
             https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        """communicate(), killing the process if interrupted (and waiting for it)."""
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        # timeout != 0 additionally waits for the process to terminate
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        """Run a process to completion; return (stdout, stderr, returncode)."""
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or '', stderr or '', proc.returncode
890
d3c93ec2 891
aa49acd1
S
def get_subprocess_encoding():
    """Return the text encoding to use for subprocess input/output."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
902
903
8bf48f23 904def encodeFilename(s, for_subprocess=False):
19a03940 905 assert isinstance(s, str)
cfb0511d 906 return s
aa49acd1
S
907
908
909def decodeFilename(b, for_subprocess=False):
cfb0511d 910 return b
8bf48f23 911
f07b74fc
PH
912
def encodeArgument(s):
    """Return the argument as str, decoding legacy byte strings as ASCII."""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
f07b74fc
PH
918
919
def decodeArgument(b):
    """Legacy shim: arguments are already str on Python 3; returned unchanged."""
    return b
aa49acd1
S
922
923
8271226a
PH
def decodeOption(optval):
    """Normalise a command-line option value to str (or None).

    Byte strings are decoded using the locale's preferred encoding.
    """
    if optval is None:
        return None
    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(decoded, str)
    return decoded
1c256f70 932
5f6a1245 933
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into an (hours, minutes, seconds, milliseconds) tuple."""
    total_seconds, milliseconds = divmod(msec, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
942
943
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration (in seconds) as [H<delim>]MM<delim>SS, or bare
    seconds when under a minute, optionally appending milliseconds as '.mmm'."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        formatted = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        formatted = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        formatted = '%d' % t.seconds
    if not msec:
        return formatted
    return '%s.%03d' % (formatted, t.milliseconds)
4539dd30 953
a0ddb8a2 954
def _ssl_load_windows_store_certs(ssl_context, storename):
    """Load trusted certificates from a Windows certificate store into ssl_context.

    Only x509_asn-encoded certs trusted for server auth are loaded; certs that
    fail to parse are silently skipped. Windows-only (ssl.enum_certificates).
    """
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        # Store not readable by this user - best effort, give up quietly
        return
    for cert in certs:
        # A single bad certificate must not prevent loading the rest
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
a2366922 966
77562778 967
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler with a TLS context configured from params.

    Honoured params: nocheckcertificate, legacyserverconnect, compat_opts,
    client_certificate, client_certificate_key, client_certificate_password.
    Raises YoutubeDLError if the client certificate cannot be loaded.
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer certifi's CA bundle when available, unless disabled via compat_opts
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 1010
732ea2f0 1011
def bug_reports_message(before=';'):
    """Return the standard 'please report this issue' blurb, appended to *before*.

    The blurb is capitalised when it starts a sentence, i.e. when the prefix
    is empty or ends with sentence-final punctuation.
    """
    from .update import REPOSITORY

    msg = (f'please report this issue on  https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        # Starting a new sentence - capitalise the first word
        msg = msg[0].title() + msg[1:]

    return f'{before} {msg}' if before else msg
08f2a92c
JMF
1023
1024
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors.

    Subclasses may set a class-level `msg` that is used when the constructor
    receives no message; failing that, the class name itself is used.
    """
    msg = None

    def __init__(self, msg=None):
        # Precedence: explicit argument > subclass default `msg` > class name
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)
bf5b9d85
PM
1035
1036
# Exception types that indicate transient network trouble; ExtractorError
# treats errors caused by any of these as "expected" (not a yt-dlp bug).
# The hasattr guard keeps this working on builds whose ssl module lacks
# CertificateError.
network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
1041
1042
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        # Errors raised while handling a known network exception are never bugs
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)  # message without the ie/video_id/cause decoration below
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        # When re-wrapping an ExtractorError, keep the innermost exc_info
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        # Final message: "[ie] video_id: msg (caused by ...)" plus, for
        # unexpected errors, the bug-report blurb
        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        """Return the formatted traceback of the error and its cause, or None if neither is set."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
01951dda 1075
1c256f70 1076
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL."""

    def __init__(self, url):
        super().__init__(f'Unsupported URL: {url}', expected=True)
        self.url = url
1082
1083
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression search finds no match."""
1087
1088
773f291d
S
class GeoRestrictedError(ExtractorError):
    """Raised when a video is unavailable from the user's geographic location
    because of restrictions imposed by the website.

    `countries`, when given, carries the region information supplied by the caller.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Geo restrictions are never a yt-dlp bug
        kwargs['expected'] = True
        self.countries = countries
        super().__init__(msg, **kwargs)
1100
1101
class UserNotLive(ExtractorError):
    """Raised when a channel/user is not currently live."""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)
1108
1109
class DownloadError(YoutubeDLError):
    """Raised by FileDownloader objects when a download fails and they are not
    configured to continue on errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info())."""
        self.exc_info = exc_info
        super().__init__(msg)
d77c3dfd
FV
1122
1123
class EntryNotInPlaylist(YoutubeDLError):
    """Raised by YoutubeDL when a requested entry cannot be found in the
    playlist info_dict."""
    msg = 'Entry not found in info'
498f5606 1131
1132
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # BUGFIX: append the actual offending filename; the previous code
            # appended the literal placeholder text '(unknown)' instead of
            # interpolating the `filename` argument (compare the parallel
            # `self.msg += f': {err}'` pattern in UnavailableVideoError).
            self.msg += f': {filename}'
        super().__init__(self.msg)
d77c3dfd
FV
1145
1146
class PostProcessingError(YoutubeDLError):
    """Raised by a PostProcessor's .run() method to indicate that the
    postprocessing task failed."""
5f6a1245 1153
5f6a1245 1154
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted."""
    msg = 'The download was cancelled'
8b0d7497 1158
8b0d7497 1159
class ExistingVideoReached(DownloadCancelled):
    """Raised when --break-on-existing is triggered."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1163
48f79687 1164
class RejectedVideoReached(DownloadCancelled):
    """Raised when --break-on-reject is triggered."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1168
1169
class MaxDownloadsReached(DownloadCancelled):
    """Raised when the --max-downloads limit has been reached."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1173
1174
class ReExtractInfo(YoutubeDLError):
    """Raised when video info needs to be re-extracted."""

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1181
1182
class ThrottledDownload(ReExtractInfo):
    """Raised when the download speed falls below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)
f2ebc5c7 1189
d77c3dfd 1190
class UnavailableVideoError(YoutubeDLError):
    """Raised when a video is requested in a format that is not available
    for that video."""
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            # Append the underlying reason to the default message
            self.msg += f': {err}'
        super().__init__(self.msg)
d77c3dfd
FV
1203
1204
class ContentTooShortError(YoutubeDLError):
    """Raised by FileDownloader objects when a downloaded file is smaller
    than the size the server announced, indicating the connection was
    probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
d77c3dfd 1218
5f6a1245 1219
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended file attributes fails.

    `reason` classifies the failure as 'NO_SPACE', 'VALUE_TOO_LONG' or
    'NOT_SUPPORTED', derived from the errno `code` and the message text.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Classify the failure from the errno and/or the message wording
        no_space = (self.code in (errno.ENOSPC, errno.EDQUOT)
                    or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg)
        value_too_long = self.code == errno.E2BIG or 'Argument list too long' in self.msg
        if no_space:
            self.reason = 'NO_SPACE'
        elif value_too_long:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1234
1235
class XAttrUnavailableError(YoutubeDLError):
    # NOTE(review): bare marker subclass so callers can distinguish "xattr
    # support missing entirely" from XAttrMetadataError's "a write failed" -
    # confirm against the xattr helpers elsewhere in this file.
    pass
1238
1239
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, optionally forcing the configured source address.

    When the handler's params contain 'source_address', the connection's
    address-resolution routine is replaced so only addresses of the matching
    IP family (v4/v6) are tried. `is_https` is accepted for signature parity
    with the callers and not used here.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Pick the address family from the *source* address's syntax
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    # Remember the failure and try the next resolved address
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1285
1286
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, any Accept-Encoding header (case-insensitive)
    is removed as well, so the request is sent without compression. The input
    mapping is returned unmodified when the marker is absent.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    return {k: v for k, v in headers.items()
            if k.lower() != 'accept-encoding' and k != 'Youtubedl-no-compression'}
87f0e62d
YCH
1295
1296
class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        # yt-dlp params dict; consulted for http_headers and source_address
        self._params = params

    def http_open(self, req):
        """Open a plain-HTTP connection, honouring a per-request SOCKS proxy."""
        conn_class = http.client.HTTPConnection

        # SOCKS proxy is smuggled in via a pseudo-header; strip it before sending
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress a deflate payload, with or without the zlib header."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress a brotli payload (requires the optional brotli dependency)."""
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        """Prepare an outgoing request: escape the URL and add default headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Advertise only the compression schemes we can actually decode
        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        # Strip the internal Youtubedl-no-compression marker (and, with it,
        # Accept-Encoding) before the request goes on the wire
        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        """Transparently decompress gzip/deflate/brotli bodies and re-escape
        redirect Location headers."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 1425
5de90176 1426
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* whose connect() tunnels through the
    SOCKS proxy described by the *socks_proxy* URL.

    Supported schemes: socks5, socks/socks4, socks4a. NOTE(review): any other
    scheme leaves `socks_type` unbound and raises NameError when building
    proxy_args - callers presumably pre-validate the scheme; confirm.
    """
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        # Credentials are percent-encoded in the proxy URL; empty/None pass through
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            # Replace the normal socket with a SOCKS-aware one
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, wrap the tunnelled socket in TLS
            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1468
1469
class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    """HTTPS handler that routes connections through yt-dlp's connection
    factory (source_address support) with optional per-request SOCKS proxying."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        # Connection class may be substituted, e.g. wrapped for SOCKS below
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        # Forward the TLS context / hostname checking set up by the base handler
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # SOCKS proxy is smuggled in via a pseudo-header; strip it before sending
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            # Surface a hint for servers that need the legacy renegotiation option
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
be4a824d
PH
1498
1499
class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    # A Netscape cookie line has exactly 7 tab-separated fields
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp.  Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        # Pass filename=None to the base class; we keep our own (possibly
        # file-object) filename and normalise path-likes to str
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        """Render a boolean in the cookies.txt TRUE/FALSE convention."""
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        """Whether *file* is a filesystem path (vs an already-open file object)."""
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        """Yield a text file object for *file*, which may be a path or a file
        object; an existing file object is truncated when writing."""
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        """Write all applicable cookies to *f* in Netscape format."""
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Strip the #HttpOnly_ prefix so the base parser accepts the entry
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except http.cookiejar.LoadError as e:
                    # A leading [, { or " strongly suggests a JSON export, not cookies.txt
                    if f'{line.strip()} '[0] in '[{"':
                        raise http.cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See  '
                            'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1632
1633
class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
    """Cookie processor that applies the same cookie handling to HTTPS
    requests and responses as to plain HTTP ones."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    https_request = urllib.request.HTTPCookieProcessor.http_request
    https_response = http_response
1643
1644
class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    Based on the HTTPRedirectHandler implementation from CPython [1], with
    two differences:
     - the redirect URL is always unicode (originally a python 2 concern)
     - the experimental HTTP status code 308 Permanent Redirect [2], used by
       some sites [3], is supported

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up Request for a redirect, or raise HTTPError.

        Called by the http_error_30x methods when a redirection response is
        received. Returns a new Request so that http_error_30x can perform
        the redirect; raises HTTPError when the redirect must not be
        followed automatically.
        """
        method = req.get_method()
        followable = (
            code in (301, 302, 303, 307, 308) and method in ('GET', 'HEAD')
            or code in (301, 302, 303) and method == 'POST')
        if not followable:
            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), a 301/302 response to a POST must
        # not cause a redirection without user confirmation. In practice,
        # essentially all clients redirect anyway, so we do the same.

        # Be conciliant with URIs containing a space. This is mainly redundant
        # with the more complete encoding done in http_error_302(), but is
        # kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Entity headers describe the old request body and must not carry over
        remaining_headers = {
            k: v for k, v in req.headers.items()
            if k.lower() not in ('content-length', 'content-type')}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and method != 'HEAD':
            method = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and method == 'POST':
            method = 'GET'

        return urllib.request.Request(
            newurl, headers=remaining_headers, origin_req_host=req.origin_req_host,
            unverifiable=True, method=method)
fca6dba8
S
1705
1706
46f59e89
S
def extract_timezone(date_str):
    """Split a date string into (timezone, remainder).

    Returns a tuple (datetime.timedelta, str): the recognized UTC offset and
    the date string with the timezone part stripped.  Numeric offsets
    ('Z', '+05:30', '-0100') are parsed directly; otherwise a trailing
    alphabetic name is looked up in the module-level TIMEZONE_NAMES table.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        # No numeric offset: try a trailing uppercase timezone name after a
        # time-of-day ("... 12:30 EST") and map it through TIMEZONE_NAMES.
        m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
        timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
        if timezone is not None:
            # Only strip the name when it was actually recognized.
            date_str = date_str[:-len(m.group('tz'))]
        timezone = datetime.timedelta(hours=timezone or 0)
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            # Plain 'Z' (UTC): zero offset.
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1735
1736
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp for an ISO 8601-ish date string, or None.

    @param delimiter  character between the date and time parts
    @param timezone   explicit datetime.timedelta offset; when None the
                      offset is extracted from the string itself
    """
    if date_str is None:
        return None

    # Drop fractional seconds; the strptime format below has no %f field.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        parsed = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S')
    except ValueError:
        return None
    return calendar.timegm((parsed - timezone).timetuple())
912b38b4
PH
1752
1753
46f59e89
S
def date_formats(day_first=True):
    """Return the candidate strptime formats, day-first or month-first."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1757
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    # Normalize: commas become spaces, AM/PM markers (plus an optional
    # trailing timezone name) and numeric offsets are stripped.
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    upload_date = None
    # Deliberately no break: the last matching format wins, mirroring the
    # precedence order of date_formats().
    for fmt in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            pass

    if upload_date is None:
        # Fall back to the RFC 2822 parser from the email package.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return str(upload_date)
bf50b038 1780
5f6a1245 1781
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a POSIX timestamp parsed from a free-form date string, or None.

    @param day_first  interpret ambiguous numeric dates as day/month rather
                      than month/day
    """
    if date_str is None:
        return None

    # Strip commas/pipes and weekday names, which the strptime formats used
    # below do not include, then collapse runs of whitespace.
    date_str = re.sub(r'\s+', ' ', re.sub(
        r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))

    # NOTE(review): "PM" unconditionally adds 12 hours, so "12:30 PM" would
    # gain an extra 12h — confirm intended behavior before changing.
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # First match wins here (unlike unified_strdate, which keeps looping).
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    # Fallback: RFC 2822 parsing via the email package; its result is naive,
    # so the PM and timezone adjustments are applied in seconds.
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1813
1814
28e614de 1815def determine_ext(url, default_ext='unknown_video'):
85750f89 1816 if url is None or '.' not in url:
f4776371 1817 return default_ext
9cb9a5df 1818 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1819 if re.match(r'^[A-Za-z0-9]+$', guess):
1820 return guess
a7aaa398
S
1821 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1822 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1823 return guess.rstrip('/')
73e79f2a 1824 else:
cbdbb766 1825 return default_ext
73e79f2a 1826
5f6a1245 1827
824fa511
S
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle filename: replace the extension with '<lang>.<format>'."""
    return replace_extension(filename, f'{sub_lang}.{sub_format}', expected_real_ext)
d4051a8e 1830
5f6a1245 1831
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # Recursively resolve the base date, then apply the signed offset.
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # Months/years have variable length, so use calendar arithmetic;
            # afterwards round at day granularity.
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                # timedelta has no 'weeks' key in this mapping; convert to days.
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            return datetime_round(new_date, unit)
        return new_date

    # No relative suffix: parse a literal DATE with the supplied format.
    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1872
1873
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1884
1885
def datetime_add_months(dt, months):
    """Shift dt by a (possibly negative) number of months, clamping the day
    to the target month's length (e.g. Jan 31 + 1 month -> Feb 28/29)."""
    year_delta, month_index = divmod(dt.month - 1 + months, 12)
    year = dt.year + year_delta
    month = month_index + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
1893
1894
def datetime_round(dt, precision='day'):
    """Round dt's time to the nearest multiple of the given precision
    ('microsecond' returns dt unchanged)."""
    if precision == 'microsecond':
        return dt

    unit_seconds = {'day': 86400, 'hour': 3600, 'minute': 60, 'second': 1}
    step = unit_seconds[precision]
    timestamp = calendar.timegm(dt.timetuple())
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
5f6a1245
JW
1911
1912
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    any other input is returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
1921
5f6a1245 1922
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = (date_from_str(start, strict=True) if start is not None
                      else datetime.datetime.min.date())
        self.end = (date_from_str(end, strict=True) if end is not None
                    else datetime.datetime.max.date())
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return f'{self.start.isoformat()} - {self.end.isoformat()}'

    def __eq__(self, other):
        return (isinstance(other, DateRange)
                and self.start == other.start and self.end == other.end)
1956
c496ca96
PH
1957
def platform_name():
    """ Returns the platform name as a str """
    # Deprecated shim kept for backward compatibility; warns once and
    # delegates to the stdlib.
    deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
    return platform.platform()
c496ca96 1962
b1f94422 1963
@functools.cache
def system_identifier():
    """One-line description of the running interpreter and OS (for bug reports)."""
    impl = platform.python_implementation()
    if impl == 'PyPy' and hasattr(sys, 'pypy_version_info'):
        # Include the PyPy release, not just the emulated CPython version.
        impl += ' version %d.%d.%d' % sys.pypy_version_info[:3]

    libc = format_field(join_nonempty(*platform.libc_ver(), delim=' '), None, '(%s)')
    return 'Python %s (%s %s) - %s %s' % (
        platform.python_version(), impl, platform.architecture()[0],
        platform.platform(), libc)
c257baff
PH
1977
1978
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
49fa4d9a
N
1986
1987
def write_string(s, out=None, encoding=None):
    """Write the text `s` to the stream `out` (default: sys.stderr) and flush.

    Handles byte-mode streams and text streams exposing a raw `.buffer`
    by encoding explicitly with `encoding` (or the preferred encoding).
    """
    assert isinstance(s, str)
    out = out or sys.stderr

    # On Windows terminals that support ANSI sequences, prefix newline runs
    # with a space — presumably to work around console rendering issues
    # around line boundaries (TODO confirm).
    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    enc, buffer = None, out
    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: must encode ourselves.
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer (e.g. sys.stderr):
        # write encoded bytes to the buffer directly.
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
2004
2005
def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
    """Emit a deprecation message, at most once per unique `msg` in CLI mode.

    In CLI mode the message (plus the bug-report blurb) is printed via
    `printer` or write_string; otherwise a DeprecationWarning is issued,
    attributed to the caller's caller (stacklevel + 3).
    """
    from . import _IN_CLI
    if _IN_CLI:
        if msg in deprecation_warning._cache:
            return  # already reported this message once
        deprecation_warning._cache.add(msg)
        if printer:
            return printer(f'{msg}{bug_reports_message()}', **kwargs)
        return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
    else:
        import warnings
        warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)


# Messages already emitted in CLI mode (deduplication store).
deprecation_warning._cache = set()
2021
2022
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a bytes-like (or str) sequence into a list of ints."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # bytes/bytearray already yield ints
        return list(bs)
    return [ord(ch) for ch in bs]
2030
c257baff 2031
def intlist_to_bytes(xs):
    """Pack a sequence of byte values (0-255) into a bytes object."""
    if not xs:
        return b''
    return struct.pack(f'{len(xs)}B', *xs)
c38b1e77
PH
2036
2037
class LockingUnsupportedError(OSError):
    """Raised when no file-locking primitive is available on this platform."""
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
2043
2044
c1c9a79c
PH
# Cross-platform file locking: defines _lock_file(f, exclusive, block) and
# _unlock_file(f).  Windows uses LockFileEx/UnlockFileEx via ctypes; other
# platforms use fcntl (flock with a lockf fallback); if neither is
# available, both helpers raise LockingUnsupportedError.
if sys.platform == 'win32':
    import ctypes
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED struct required by LockFileEx/UnlockFileEx.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering the whole file (low/high dwords of the length).
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive on the file object for unlock.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # Non-blocking lock already held elsewhere: propagate as-is.
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            # Mirror the lock path: prefer flock, fall back to lockf.
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
2131
class locked_file:
    """A file wrapper that holds an OS-level lock for the file's lifetime.

    Usable as a context manager; other file methods are proxied to the
    underlying file object via __getattr__.
    """
    locked = False  # whether the lock is currently held

    def __init__(self, filename, mode, block=True, encoding=None):
        # Only plain read/write/append modes are supported.
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        # Readers take a shared lock; any writing mode takes an exclusive one.
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only once the lock is held, so concurrent users never
            # observe a half-truncated file.
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        """Release the lock (idempotent); the file stays open."""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the object can also be used without a with-statement.
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate everything else (read, write, seek, ...) to the real file.
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
a3125791 2195
4eb7f1d1 2196
@functools.cache
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
2201
2202
def shell_quote(args):
    """Quote each argument for the shell and join them with spaces.

    Byte arguments (e.g. from encodeFilename) are decoded with the
    filesystem encoding first.
    """
    encoding = get_filesystem_encoding()
    return ' '.join(
        compat_shlex_quote(a.decode(encoding) if isinstance(a, bytes) else a)
        for a in args)
9d4660ca
PH
2212
2213
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge with any data already smuggled into the URL.
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = urllib.parse.urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return f'{url}#{payload}'
9d4660ca
PH
2222
2223
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): return (url, data); data is `default`
    when nothing was smuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, payload = smug_url.rpartition('#')
    data = json.loads(urllib.parse.parse_qs(payload)['__youtubedl_smuggle'][0])
    return url, data
02dbf93f
PH
2231
2232
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """Format a non-negative number with a magnitude suffix (k, M, G, ...);
    with factor=1024 the binary forms (Ki, Mi, ...) are used.  None for
    negative or unparseable input."""
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        exponent = min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
    suffix = ('', *POSSIBLE_SUFFIXES)[exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    return fmt % (num / factor ** exponent, suffix)
e0fd9573 2245
2246
def format_bytes(bytes):
    """Human-readable byte count (e.g. '1.50KiB'); 'N/A' when unknown."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
f53c966a 2249
1c088fa8 2250
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number><unit>' from s using unit_table as the
    multiplier map; return the scaled int, or None when nothing matches."""
    units_re = '|'.join(map(re.escape, unit_table))
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Accept ',' as a decimal separator too.
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
2260
2261
be64b5b0
PH
def parse_filesize(s):
    """Parse a human-readable file size ('5.4 MiB', '500kB', ...) into bytes
    (int), or return None when unparseable.

    Capitalized decimal units (KB, MB, ...) use powers of 1000 and the
    binary units (KiB, MiB, ...) powers of 1024, per convention.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
2331
2332
def parse_count(s):
    """Parse a human-readable count like '1.2M views' into an int; None on
    failure."""
    if s is None:
        return None

    # Strip a non-numeric prefix ("Views: 123") and surrounding whitespace.
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    parsed = lookup_unit_table(_UNIT_TABLE, s)
    if parsed is not None:
        return parsed

    # Last resort: a bare number followed by end-of-string or whitespace.
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(mobj.group(1)) if mobj else None
be64b5b0 2360
2f7ae819 2361
def parse_resolution(s, *, lenient=False):
    """Extract {'width': ..., 'height': ...} from strings like '1920x1080',
    '720p' or '4k'; {} when nothing is recognized.

    @param lenient  allow the WxH pattern to sit inside other alphanumerics
    """
    if s is None:
        return {}

    if lenient:
        pattern = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    else:
        pattern = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    mobj = re.search(pattern, s)
    if mobj:
        return {'width': int(mobj.group('w')), 'height': int(mobj.group('h'))}

    # Vertical-lines notation: 1080p / 1080i.
    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    # 4k / 8k shorthand (4k == 2160 lines).
    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}
2385
2386
def parse_bitrate(s):
    """Return the integer kbps value from a string like '128 kbps'; None for
    non-strings or when no bitrate is found."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
2393
2394
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        return names.index(name) + 1
    except ValueError:
        return None
2404
2405
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """

    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
18258362
JMF
2414
2415
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # A '&' is "bare" unless it already starts a known entity or a numeric
    # character reference.
    bare_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_amp, '&amp;', xml_str)
e3946f98
PH
2422
2423
def setproctitle(title):
    """Best-effort: set the process title via glibc prctl(PR_SET_NAME).

    Silently does nothing when ctypes, libc.so.6 or prctl is unavailable
    (non-Linux platforms, stripped environments).
    """
    assert isinstance(title, str)

    # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
    try:
        import ctypes
    except ImportError:
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME (see prctl(2)).
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
2449
2450
def remove_start(s, start):
    """Strip the prefix `start` from s when present; None-safe."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
29eb5174
PH
2453
2454
def remove_end(s, end):
    """Strip the suffix `end` from s when present; None-safe."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
2b9faf55
PH
2457
2458
31b2051e
S
def remove_quotes(s):
    """Strip one matching pair of surrounding quotes ('...' or "...")."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s.startswith(quote) and s.endswith(quote):
            return s[1:-1]
    return s
2466
2467
def get_domain(url):
    """
    This implementation is inconsistent, but is kept for compatibility.
    Use this only for "webpage_url_domain"
    """
    netloc = urllib.parse.urlparse(url).netloc
    if netloc.startswith('www.'):
        netloc = netloc[len('www.'):]
    return netloc or None
b6e0c7d2
U
2474
2475
def url_basename(url):
    """Last path component of a URL: '.../a/b.mp4?x=1' -> 'b.mp4'."""
    path = urllib.parse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
aa94a6d3
PH
2479
2480
def base_url(url):
    """Everything up to and including the last '/' before any query/fragment.

    Raises AttributeError (via .group() on None) for non-http(s) input,
    matching the historic behavior callers rely on.
    """
    m = re.match(r'https?://[^?#]+/', url)
    return m.group()
02dc0a36
S
2483
2484
def urljoin(base, path):
    """Join base and path like urllib.parse.urljoin, tolerating bytes input;
    returns None when either part is unusable."""
    if isinstance(path, bytes):
        path = path.decode()
    if not path or not isinstance(path, str):
        return None
    # Already absolute (has a scheme or is protocol-relative).
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
e34c3361
S
2498
2499
class HEADRequest(urllib.request.Request):
    """Request subclass that always issues an HTTP HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
2503
2504
class PUTRequest(urllib.request.Request):
    """Request subclass that always issues an HTTP PUT."""

    def get_method(self):
        return 'PUT'
2508
2509
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v (or v.<get_attr>) to int and rescale by invscale/scale;
    return default on any conversion failure."""
    value = getattr(v, get_attr, None) if get_attr and v is not None else v
    try:
        return int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
9732d77e 2517
9572013d 2518
def str_or_none(v, default=None):
    """str(v), or default when v is None."""
    if v is None:
        return default
    return str(v)
40a90862 2521
9732d77e
PH
2522
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    elif isinstance(int_str, str):
        # Drop thousands separators / plus signs before conversion.
        digits = re.sub(r'[,\.\+]', '', int_str)
        try:
            return int(digits)
        except (ValueError, TypeError, OverflowError):
            return None
608d11f5
PH
2530
2531
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float and rescale by invscale/scale; return default when
    v is None or unconvertible."""
    try:
        return default if v is None else float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
43f775e4
PH
2539
2540
c7e327c4
S
def bool_or_none(v, default=None):
    """Pass v through only when it is an actual bool; otherwise default."""
    if isinstance(v, bool):
        return v
    return default
2543
2544
def strip_or_none(v, default=None):
    """v.strip() for strings; default for any other type."""
    if isinstance(v, str):
        return v.strip()
    return default
b72b4431
S
2547
2548
def url_or_none(url):
    """Return the stripped URL when it looks like a supported scheme
    (http(s), rtmp-family, mms, ftp(s)) or is protocol-relative; else None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
af03000a
S
2554
2555
def request_to_url(req):
    """Return the URL string for either a urllib Request or a plain URL."""
    if isinstance(req, urllib.request.Request):
        return req.get_full_url()
    return req
2561
2562
def strftime_or_none(timestamp, date_format, default=None):
    """Format a unix timestamp (int/float) or a 'YYYYMMDD' string with
    date_format; return default on any parse/format failure."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            parsed = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            parsed = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            parsed = None  # .strftime below raises AttributeError -> default
        return parsed.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2573
2574
def parse_duration(s):
    """Parse a duration string into seconds (float), or return None.

    Accepts clock style ('1:23:45', '01:02:03.05'), ISO-8601-ish/verbose
    forms ('P1DT2H', '2 hours 3 min') and bare '<x> hours' / '<x> min'.
    """
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # 1) Clock style "[[dd:]hh:]mm:]ss[.ms]".
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2) ISO-8601-ish / verbose.  Year/month/week prefixes are matched
        # but not captured, i.e. they contribute nothing to the total.
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3) Bare fractional "<x> hours" / "<x> min".
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        # The clock form may use ':' as the fractional separator.
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2629
2630
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the real extension: 'a.mp4' -> 'a.temp.mp4'.

    When expected_real_ext is given and the actual extension differs, `ext`
    is appended after the whole filename instead (same convention as
    replace_extension).
    """
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        # Fix: the mismatch branch used to return a literal '(unknown)'
        # placeholder, discarding the original filename entirely.
        else f'{filename}.{ext}')
d70ad093
PH
2637
2638
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with ext.

    If expected_real_ext is given and the current extension differs, ext is
    appended to the full filename instead of replacing anything.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return f'{name}.{ext}'
2644
2645
d70ad093
PH
def check_executable(exe, args=[]):
    """Check whether exe is runnable from PATH; return its name, or False.

    args may hold arguments that produce a short output (like -version).
    """
    cmd = [exe, *args]
    try:
        Popen.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
b7ab0590
PH
2654
2655
def _get_exe_version_output(exe, args, *, to_screen=None):
    """Run exe with args and return its combined stdout+stderr, or False on OSError."""
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN must be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, _ = Popen.run(
            [encodeArgument(exe)] + args, text=True,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return stdout
cae97f65
PH
2668
2669
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    Falls back to a generic 'version <x>' pattern when version_re is None,
    and to `unrecognized` when nothing matches.
    """
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    mobj = re.search(version_re, output)
    return mobj.group(1) if mobj else unrecognized
2679
2680
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    output = _get_exe_version_output(exe, args)
    if not output:
        return False
    return detect_exe_version(output, version_re, unrecognized)
2687
2688
def frange(start=0, stop=None, step=1):
    """Float-capable analogue of range(); yields nothing when step is 0."""
    if stop is None:
        start, stop = 0, start
    if not step:
        return
    direction = 1 if step > 0 else -1
    current = start
    while direction * current < direction * stop:
        yield current
        current += step
2697
2698
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    # Raised instead of the builtin so callers can distinguish "lazy list
    # ran out" from unrelated IndexErrors; still *is* an IndexError.
    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache is shared (not copied) by __reversed__/__copy__ so that
        # already-consumed items are visible to all views of the iterable
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        # Pull every remaining item into the cache
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # ~i == -(i + 1): the same position counted from the opposite end
        return None if x is None else ~x

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Otherwise only pull enough items to satisfy the highest index needed
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Probe the first element that would be yielded; avoids full exhaustion
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
483336e7 2787
class PagedList:
    """Base class for lazily-fetched, page-based sequences.

    Subclasses implement _getslice(); pages fetched via pagefunc are cached
    per page number when use_cache is True.
    """

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        # Updated by subclasses once the real page count becomes known
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        page_results = self._cache.get(pagenum)
        if page_results is None:
            # Pages beyond the known page count are empty by definition
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
55575225 2826
9c44d242
PH
2827
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # Walk pages starting at the one containing `start`
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested window inside the current page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember that pages from here on are unavailable
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
81c2f20b
PH
2867
2868
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        # Clamp to the known page count; end is exclusive
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                # Only the first page needs its leading items trimmed
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
9c44d242
PH
2893
2894
class PlaylistEntries:
    """Resolves the requested subset of a playlist's entries.

    Wraps an info_dict's 'entries' (list, PagedList or generator) and yields
    (1-based index, entry) pairs according to the user's playlist_items /
    playliststart / playlistend parameters.
    """

    # Sentinel for "slot exists but the entry was not extracted"
    MissingEntry = object()
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = bool(requested_entries)
        if self.is_incomplete:
            assert self.is_exhausted
            # Scatter the known entries into their 1-based positions
            self._entries = [self.MissingEntry] * max(requested_entries)
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Grammar of one '--playlist-items' segment: START[:END[:STEP]]
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        # Yields ints (single items) and slices (ranges), 1-based
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                if not entry:
                    continue
                try:
                    # TODO: Add auto-generated fields
                    self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        # Total number of entries, when determinable without extra fetches
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function mapping a 0-based index to an entry, raising
        # self.IndexError when the index is out of range
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Yields (1-based index, entry) pairs for an int or slice index
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
3027
3028
def uppercase_escape(s):
    """Replace literal \\UXXXXXXXX escapes in s with the characters they denote."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
0fe2ff78
YCH
3035
3036
def lowercase_escape(s):
    """Replace literal \\uXXXX escapes in s with the characters they denote."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda mobj: decode(mobj.group(0))[0],
        s)
b53466e1 3043
d05cfe06
S
3044
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    return urllib.parse.quote(s, safe=b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
3048
3049
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = urllib.parse.urlparse(url)
    # The host is IDNA-encoded; every other component is percent-escaped
    return parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    ).geturl()
3060
62e609ab 3061
def parse_qs(url):
    """Parse the query string of url into a dict mapping keys to value lists."""
    query = urllib.parse.urlparse(url).query
    return urllib.parse.parse_qs(query)
4dfbf869 3064
3065
62e609ab
PH
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping BOMs, blanks and comments."""
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        # Strip a leading BOM, whether raw UTF-8 bytes mis-decoded or U+FEFF
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, 1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return list(filter(None, map(fixup, fd)))
b74fa8cd
JMF
3083
3084
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return ascii bytes, ready for a POST body."""
    encoded = urllib.parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
3087
3088
def update_url_query(url, query):
    """Return url with the items of `query` merged into its query string."""
    if not query:
        return url
    parsed = urllib.parse.urlparse(url)
    qs = urllib.parse.parse_qs(parsed.query)
    qs.update(query)
    new_query = urllib.parse.urlencode(qs, True)
    return urllib.parse.urlunparse(parsed._replace(query=new_query))
16392824 3097
8e60dc75 3098
def update_Request(req, url=None, data=None, headers=None, query=None):
    # Build a fresh urllib request replicating `req` with the given overrides,
    # preserving the original HTTP method (HEAD/PUT need special Request types)
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = urllib.request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # timeout is not a standard Request attribute; copy it only when present
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3117
3118
10c87c15 3119def _multipart_encode_impl(data, boundary):
0c265486
YCH
3120 content_type = 'multipart/form-data; boundary=%s' % boundary
3121
3122 out = b''
3123 for k, v in data.items():
3124 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3125 if isinstance(k, str):
0f06bcd7 3126 k = k.encode()
14f25df2 3127 if isinstance(v, str):
0f06bcd7 3128 v = v.encode()
0c265486
YCH
3129 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3130 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3131 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3132 if boundary.encode('ascii') in content:
3133 raise ValueError('Boundary overlaps with data')
3134 out += content
3135
3136 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3137
3138 return out, content_type
3139
3140
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            # Retry with a fresh random boundary if it collides with the data
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            if has_specified_boundary:
                raise
            boundary = None
3169
3170
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first non-None (and, optionally, truthy) value among the keys."""
    for key in variadic(key_or_keys):
        value = d.get(key)
        if value is None:
            continue
        if value or not skip_false_values:
            return value
    return default
cbecc9b9
S
3176
3177
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each func in turn; return the first result of the expected type.

    Common lookup errors (AttributeError, KeyError, TypeError, IndexError,
    ZeroDivisionError) are swallowed; returns None when nothing succeeds.
    """
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
3187
3188
def try_get(src, getter, expected_type=None):
    # Legacy wrapper around try_call: apply one or more getter callables to
    # src and return the first result matching expected_type (if given)
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
3191
3192
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of dct keeping only the items for which cndn(key, value) holds."""
    result = {}
    for key, value in dct.items():
        if cndn(key, value):
            result[key] = value
    return result
3195
3196
6cc62232
S
def merge_dicts(*dicts):
    """Merge dicts left-to-right; earlier dicts win, except that an empty
    string value may be replaced by a later (string) value."""
    merged = {}
    for current in dicts:
        for key, value in current.items():
            if value is None:
                continue
            if key not in merged or (isinstance(value, str) and merged[key] == ''):
                merged[key] = value
    return merged
3205
3206
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Pass str through unchanged; otherwise decode the bytes-like value with
    # the given encoding/error handler.
    # NOTE: the default encoding is evaluated once, at module import time.
    return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 3209
16392824 3210
a1a530b0
PH
# MPAA rating -> minimum viewer age
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines rating -> minimum viewer age
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3228
3229
def parse_age_limit(s):
    """Parse an age limit from an int, 'NN+' string, MPAA or TV rating.

    Returns an int age, or None when s cannot be interpreted.
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    mobj = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if mobj:
        return TV_PARENTAL_GUIDELINES['TV-' + mobj.group(1)]
    return None
146c80e2
S
3246
3247
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the raw JSON payload."""
    # Optionally matches 'window.', a 'cb && cb(...)' guard, a trailing
    # semicolon and line comments after the call
    pattern = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(pattern, r'\g<callback_data>', code)
478c2c61
PH
3256
3257
def js_to_json(code, vars={}, *, strict=False):
    """Convert a JavaScript object/value literal into JSON text.

    vars is a dict of var, val pairs to substitute. When strict is True,
    unknown bare identifiers raise ValueError instead of being quoted, and
    'new Date(...)' is left untouched.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # Hex and octal integers, optionally used as object keys (trailing ':')
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def fix_kv(m):
        # Rewrite a single matched token (string, identifier, number, ...)
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # Normalize JS string escapes to their JSON equivalents
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        if v in vars:
            return vars[v]
        if strict:
            raise ValueError(f'Unknown value: {v}')

        return '"%s"' % v

    def create_map(mobj):
        # 'new Map([[k, v], ...])' -> plain JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
    if not strict:
        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    # Token scanner: strings, comments, dangling commas, identifiers,
    # hex/octal numbers, numeric keys and '!' coercions
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
3313
3314
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Unknown qualities sort below every known one
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
3323
acd69589 3324
# Stages at which postprocessors may be scheduled to run
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')


# Default output filename templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types; the value is the default filename suffix
# (None = derived from the media filename)
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])? # unused in python
        {1} # conversion type
    )
'''


# Conversion-type characters accepted by %-style formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3364
7d1eb38a 3365
a020a0dc
PH
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ellipses = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ellipses)] + ellipses
48844745
PH
3374
3375
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
3378
3379
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit; assume_new decides unknowns."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
3387
3388
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    from .update import is_non_updateable  # deferred import to avoid a cycle

    return not is_non_updateable()
7d4111ed
PH
3395
3396
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = (compat_shlex_quote(a) for a in args)
    return ' '.join(quoted)
2ccd1b10
PH
3400
3401
def error_to_compat_str(err):
    # Kept for backward compatibility; simply stringifies the exception
    return str(err)
fdae2358
S
3404
3405
def error_to_str(err):
    # 'ExceptionType: message' representation, e.g. for log output
    return f'{type(err).__name__}: {err}'
3408
3409
def mimetype2ext(mt):
    """Guess a file extension from a MIME type (ignoring any ';' parameters)."""
    if mt is None:
        return None

    # Drop parameters such as '; codecs=...'
    mt, _, params = mt.partition(';')
    mt = mt.strip()

    # Exact full-type matches take priority
    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }
    if mt in FULL_MAP:
        return FULL_MAP[mt]

    # Then match on the subtype alone (case-insensitive)
    _, _, subtype = mt.rpartition('/')
    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }
    ext = SUBTYPE_MAP.get(subtype.lower())
    if ext is not None:
        return ext

    # Finally try the '+suffix' part (e.g. 'foo+json' -> 'json')
    _, _, suffix = subtype.partition('+')
    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
c460bdd5
PH
3472
3473
2814f12b
THD
def ext2mimetype(ext_or_url):
    """Guess the MIME type from a file extension or URL; None when unknown/empty."""
    if not ext_or_url:
        return None
    if '.' not in ext_or_url:
        # Bare extension: turn it into a dummy filename for guess_type
        ext_or_url = f'file.{ext_or_url}'
    mime, _ = mimetypes.guess_type(ext_or_url)
    return mime
3480
3481
def parse_codecs(codecs_str):
    # Split an RFC 6381 codecs string into vcodec/acodec/scodec and an HDR hint.
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        # Strip leading zeros in numeric parts before splitting on '.'
        parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
        if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                        'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                # Keep only the first video codec encountered
                continue
            vcodec = full_codec
            if parts[0] in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
                hdr = 'HDR10'
            elif parts[:2] == ['vp9', '2']:
                hdr = 'HDR10'
        elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
                          'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            acodec = acodec or full_codec
        elif parts[0] in ('stpp', 'wvtt'):
            scodec = scodec or full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Two unrecognized codecs: assume video + audio, in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3522
3523
fc61aff4
LL
def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
    """Choose a container extension able to hold the given codecs/extensions.

    preferences, when given, restricts and orders the candidate containers;
    'mkv' is used as the catch-all when allowed.
    """
    assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)

    allow_mkv = not preferences or 'mkv' in preferences

    # Multiple streams of the same kind only fit in mkv
    if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
        return 'mkv'  # TODO: any other format allows this?

    # TODO: All codecs supported by parse_codecs isn't handled here
    COMPATIBLE_CODECS = {
        'mp4': {
            'av1', 'hevc', 'avc1', 'mp4a',  # fourcc (m3u8, mpd)
            'h264', 'aacl',  # Set in ISM
        },
        'webm': {
            'av1', 'vp9', 'vp8', 'opus', 'vrbs',
            'vp9x', 'vp8x',  # in the webm spec
        },
    }

    # Reduce e.g. 'avc1.640028' to 'avc1' (also dropping '0' characters)
    sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
    vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)

    for ext in preferences or COMPATIBLE_CODECS.keys():
        codec_set = COMPATIBLE_CODECS.get(ext, set())
        if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
            return ext

    # Fall back to matching on the source extensions themselves
    COMPATIBLE_EXTS = (
        {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
        {'webm'},
    )
    for ext in preferences or vexts:
        current_exts = {ext, *vexts, *aexts}
        if ext == 'mkv' or current_exts == {ext} or any(
                ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
            return ext
    return 'mkv' if allow_mkv else preferences[-1]
3562
3563
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a response's headers.

    Prefers the Content-Disposition filename; falls back to Content-Type.
    """
    headers = url_handle.headers

    cd = headers.get('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
3576
3577
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Encode *data* (bytes) as a base64 "data:" URI with the given MIME type."""
    encoded = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{encoded}'
3580
3581
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No viewer age limit set, or content is available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
3590
3591
# List of known byte-order-marks (BOM)
# NOTE: longer marks must precede their prefixes (utf-32-le before utf-16-le),
# since consumers test these in order and stop at the first match
BOMS = [
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
]
a904a7f8
L
3600
3601
61ca9a80
PH
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    encoding = 'utf-8'
    # Strip any (possibly repeated) byte-order mark, remembering the encoding it implies
    for bom, bom_encoding in BOMS:
        while first_bytes.startswith(bom):
            encoding = bom_encoding
            first_bytes = first_bytes[len(bom):]

    return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3611
3612
def determine_protocol(info_dict):
    """Infer the download protocol: trust an explicit 'protocol' key, otherwise
    guess from the URL prefix/extension, falling back to the URL scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3633
3634
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: ignore terminal escape sequences and the alignment tab
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        # Widest cell of each column
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only columns whose filter entry is truthy; columns beyond the
        # filter's length are kept (fillvalue=True)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # When hide_empty, drop columns whose cells are empty in every data row
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a delimiter row between the header and the data
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Pad at the tab position so the trailing text is right aligned
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
347de493
PH
3665
3666
8f18aca8 3667def _match_one(filter_part, dct, incomplete):
77b87f05 3668 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3669 STRING_OPERATORS = {
3670 '*=': operator.contains,
3671 '^=': lambda attr, value: attr.startswith(value),
3672 '$=': lambda attr, value: attr.endswith(value),
3673 '~=': lambda attr, value: re.search(value, attr),
3674 }
347de493 3675 COMPARISON_OPERATORS = {
a047eeb6 3676 **STRING_OPERATORS,
3677 '<=': operator.le, # "<=" must be defined above "<"
347de493 3678 '<': operator.lt,
347de493 3679 '>=': operator.ge,
a047eeb6 3680 '>': operator.gt,
347de493 3681 '=': operator.eq,
347de493 3682 }
a047eeb6 3683
6db9c4d5 3684 if isinstance(incomplete, bool):
3685 is_incomplete = lambda _: incomplete
3686 else:
3687 is_incomplete = lambda k: k in incomplete
3688
64fa820c 3689 operator_rex = re.compile(r'''(?x)
347de493 3690 (?P<key>[a-z_]+)
77b87f05 3691 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3692 (?:
a047eeb6 3693 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3694 (?P<strval>.+?)
347de493 3695 )
347de493 3696 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3697 m = operator_rex.fullmatch(filter_part.strip())
347de493 3698 if m:
18f96d12 3699 m = m.groupdict()
3700 unnegated_op = COMPARISON_OPERATORS[m['op']]
3701 if m['negation']:
77b87f05
MT
3702 op = lambda attr, value: not unnegated_op(attr, value)
3703 else:
3704 op = unnegated_op
18f96d12 3705 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3706 if m['quote']:
3707 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3708 actual_value = dct.get(m['key'])
3709 numeric_comparison = None
f9934b96 3710 if isinstance(actual_value, (int, float)):
e5a088dc
S
3711 # If the original field is a string and matching comparisonvalue is
3712 # a number we should respect the origin of the original field
3713 # and process comparison value as a string (see
18f96d12 3714 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3715 try:
18f96d12 3716 numeric_comparison = int(comparison_value)
347de493 3717 except ValueError:
18f96d12 3718 numeric_comparison = parse_filesize(comparison_value)
3719 if numeric_comparison is None:
3720 numeric_comparison = parse_filesize(f'{comparison_value}B')
3721 if numeric_comparison is None:
3722 numeric_comparison = parse_duration(comparison_value)
3723 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3724 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3725 if actual_value is None:
6db9c4d5 3726 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3727 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3728
3729 UNARY_OPERATORS = {
1cc47c66
S
3730 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3731 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3732 }
64fa820c 3733 operator_rex = re.compile(r'''(?x)
347de493 3734 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3735 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3736 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3737 if m:
3738 op = UNARY_OPERATORS[m.group('op')]
3739 actual_value = dct.get(m.group('key'))
6db9c4d5 3740 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3741 return True
347de493
PH
3742 return op(actual_value)
3743
3744 raise ValueError('Invalid filter part %r' % filter_part)
3745
3746
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
    Can be True/False to indicate all/none of the keys may be missing.
    All conditions on incomplete keys pass if the key is missing
    """
    # Conditions are AND-ed and separated by '&' (escapable as '\&')
    parts = re.split(r'(?<!\\)&', filter_str)
    return all(_match_one(part.replace(r'\&', '&'), dct, incomplete) for part in parts)
347de493
PH
3757
3758
def match_filter_func(filters):
    """Build a match-filter callable from one or more filter strings.

    @param filters  A filter string or collection of them; the special entry '-'
                    enables interactive mode (NO_DEFAULT returned on a match)
    @returns        None when no filters are given, otherwise a function of
                    (info_dict, incomplete) returning None to accept the entry,
                    a skip-message string to reject it, or NO_DEFAULT
    """
    if not filters:
        return None
    filters = set(variadic(filters))

    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        # Filters are OR-ed: the entry passes if any single filter matches
        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            return NO_DEFAULT if interactive and not incomplete else None
        else:
            video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
91410c9b
PH
3776
3777
class download_range_func:
    """Callable selecting sections of a video to download, by chapter-title
    regexes and/or explicit (start, end) time ranges."""

    def __init__(self, chapters, ranges):
        self.chapters = chapters
        self.ranges = ranges

    def __call__(self, info_dict, ydl):
        if info_dict.get('chapters'):
            warning = 'There are no chapters matching the regex'
        else:
            warning = 'Cannot match chapters since chapter information is unavailable'
        for regex in self.chapters or []:
            for index, chapter in enumerate(info_dict.get('chapters') or []):
                if re.search(regex, chapter['title']):
                    warning = None
                    yield {**chapter, 'index': index}
        if self.chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        for start, end in self.ranges or []:
            yield {'start_time': start, 'end_time': end}

    def __eq__(self, other):
        return (isinstance(other, download_range_func)
                and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3798
3799
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float); returns None
    for empty or unrecognized input."""
    if not time_expr:
        return

    # Offset form, e.g. "12.3" or "12.3s"
    mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # Clock form "H+:MM:SS(.frac)"; a trailing ":frames" part is treated as a
    # decimal fraction via the ':' -> '.' replacement
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3811
3812
def srt_subtitles_timecode(seconds):
    """Format *seconds* as an SRT timecode: "HH:MM:SS,mmm"."""
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3815
3816
def ass_subtitles_timecode(seconds):
    """Format *seconds* as an ASS timecode: "H:MM:SS.cc" (centisecond precision)."""
    time = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3820
3821
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> elements
    '''
    # Legacy TTAF1 namespaces are rewritten (as raw bytes) to their modern
    # TTML equivalents before parsing
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # TTML style attributes that are translated to SRT-style markup below
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}          # style id -> resolved style property dict
    default_style = {}   # document default, taken from body/div styles

    class TTMLPElementParser:
        # Streaming parser target converting a single <p> subtree to SRT text
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already applied by an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the node and feed it through the styling-aware parser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; loop again while a parent style is not yet known
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style referenced from body/div becomes the document default
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Derive the end from the duration when no explicit end is given
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3984
3985
def cli_option(params, command_option, param, separator=None):
    """Build the argv fragment for an option taking its value from *params*:
    [] when unset, ['--opt', 'value'] normally, or ['--opt<sep>value'] when a
    separator is given."""
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
66e289ba
S
3991
3992
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Like cli_option, but renders a boolean setting as *true_value* or
    *false_value* (an unset option yields [])."""
    value = params.get(param)
    assert value in (True, False, None)
    # Reuse cli_option by looking the boolean up in a value-mapping dict
    return cli_option({True: true_value, False: false_value}, command_option, value, separator)
66e289ba
S
3997
3998
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when the setting equals *expected_value*, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
66e289ba
S
4001
4002
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Resolve external-program arguments from *argdict*, trying each entry of
    *keys* in order (an entry may be a single key or a group of keys) and
    returning the flattened argument lists of the first entry that matches.
    NOTE: the shared mutable [] default must not be mutated by callers.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        else:
            argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        # Lookup is case-insensitive on the argdict side (keys lowered here)
        arg_list = list(filter(
            lambda x: x is not None,
            [argdict.get(key.lower()) for key in variadic(key_list)]))
        if arg_list:
            # Flatten the argument lists of all keys matched in this group
            return [arg for args in arg_list for arg in args]
    return default
66e289ba 4021
6251555f 4022
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Build the lookup-key order for cli_configuration_args.

    The root key is '<exe>' when main_key == exe, else '<main_key>+<exe>';
    each requested suffix in *keys* is appended to it. When the bare root key
    is among those tried, (main_key, exe) and 'default' are added as fallbacks;
    otherwise the legacy list/tuple compat behaviour is disabled.
    """
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{k}' for k in (keys or [''])]
    if root_key in keys:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    else:
        use_compat = False
    return cli_configuration_args(argdict, keys, default, use_compat)
4034
66e289ba 4035
class ISO639Utils:
    """Convert between two-letter (ISO 639-1) and three-letter (ISO 639-2/T)
    language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
4238 return short_name
4239
4240
class ISO3166Utils:
    """Map ISO 3166-1 alpha-2 country codes to their full English names."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
4502
4503
86e5f3ed 4504class GeoUtils:
773f291d
S
4505 # Major IPv4 address blocks per country
4506 _country_ip_map = {
53896ca5 4507 'AD': '46.172.224.0/19',
773f291d
S
4508 'AE': '94.200.0.0/13',
4509 'AF': '149.54.0.0/17',
4510 'AG': '209.59.64.0/18',
4511 'AI': '204.14.248.0/21',
4512 'AL': '46.99.0.0/16',
4513 'AM': '46.70.0.0/15',
4514 'AO': '105.168.0.0/13',
53896ca5
S
4515 'AP': '182.50.184.0/21',
4516 'AQ': '23.154.160.0/24',
773f291d
S
4517 'AR': '181.0.0.0/12',
4518 'AS': '202.70.112.0/20',
53896ca5 4519 'AT': '77.116.0.0/14',
773f291d
S
4520 'AU': '1.128.0.0/11',
4521 'AW': '181.41.0.0/18',
53896ca5
S
4522 'AX': '185.217.4.0/22',
4523 'AZ': '5.197.0.0/16',
773f291d
S
4524 'BA': '31.176.128.0/17',
4525 'BB': '65.48.128.0/17',
4526 'BD': '114.130.0.0/16',
4527 'BE': '57.0.0.0/8',
53896ca5 4528 'BF': '102.178.0.0/15',
773f291d
S
4529 'BG': '95.42.0.0/15',
4530 'BH': '37.131.0.0/17',
4531 'BI': '154.117.192.0/18',
4532 'BJ': '137.255.0.0/16',
53896ca5 4533 'BL': '185.212.72.0/23',
773f291d
S
4534 'BM': '196.12.64.0/18',
4535 'BN': '156.31.0.0/16',
4536 'BO': '161.56.0.0/16',
4537 'BQ': '161.0.80.0/20',
53896ca5 4538 'BR': '191.128.0.0/12',
773f291d
S
4539 'BS': '24.51.64.0/18',
4540 'BT': '119.2.96.0/19',
4541 'BW': '168.167.0.0/16',
4542 'BY': '178.120.0.0/13',
4543 'BZ': '179.42.192.0/18',
4544 'CA': '99.224.0.0/11',
4545 'CD': '41.243.0.0/16',
53896ca5
S
4546 'CF': '197.242.176.0/21',
4547 'CG': '160.113.0.0/16',
773f291d 4548 'CH': '85.0.0.0/13',
53896ca5 4549 'CI': '102.136.0.0/14',
773f291d
S
4550 'CK': '202.65.32.0/19',
4551 'CL': '152.172.0.0/14',
53896ca5 4552 'CM': '102.244.0.0/14',
773f291d
S
4553 'CN': '36.128.0.0/10',
4554 'CO': '181.240.0.0/12',
4555 'CR': '201.192.0.0/12',
4556 'CU': '152.206.0.0/15',
4557 'CV': '165.90.96.0/19',
4558 'CW': '190.88.128.0/17',
53896ca5 4559 'CY': '31.153.0.0/16',
773f291d
S
4560 'CZ': '88.100.0.0/14',
4561 'DE': '53.0.0.0/8',
4562 'DJ': '197.241.0.0/17',
4563 'DK': '87.48.0.0/12',
4564 'DM': '192.243.48.0/20',
4565 'DO': '152.166.0.0/15',
4566 'DZ': '41.96.0.0/12',
4567 'EC': '186.68.0.0/15',
4568 'EE': '90.190.0.0/15',
4569 'EG': '156.160.0.0/11',
4570 'ER': '196.200.96.0/20',
4571 'ES': '88.0.0.0/11',
4572 'ET': '196.188.0.0/14',
4573 'EU': '2.16.0.0/13',
4574 'FI': '91.152.0.0/13',
4575 'FJ': '144.120.0.0/16',
53896ca5 4576 'FK': '80.73.208.0/21',
773f291d
S
4577 'FM': '119.252.112.0/20',
4578 'FO': '88.85.32.0/19',
4579 'FR': '90.0.0.0/9',
4580 'GA': '41.158.0.0/15',
4581 'GB': '25.0.0.0/8',
4582 'GD': '74.122.88.0/21',
4583 'GE': '31.146.0.0/16',
4584 'GF': '161.22.64.0/18',
4585 'GG': '62.68.160.0/19',
53896ca5
S
4586 'GH': '154.160.0.0/12',
4587 'GI': '95.164.0.0/16',
773f291d
S
4588 'GL': '88.83.0.0/19',
4589 'GM': '160.182.0.0/15',
4590 'GN': '197.149.192.0/18',
4591 'GP': '104.250.0.0/19',
4592 'GQ': '105.235.224.0/20',
4593 'GR': '94.64.0.0/13',
4594 'GT': '168.234.0.0/16',
4595 'GU': '168.123.0.0/16',
4596 'GW': '197.214.80.0/20',
4597 'GY': '181.41.64.0/18',
4598 'HK': '113.252.0.0/14',
4599 'HN': '181.210.0.0/16',
4600 'HR': '93.136.0.0/13',
4601 'HT': '148.102.128.0/17',
4602 'HU': '84.0.0.0/14',
4603 'ID': '39.192.0.0/10',
4604 'IE': '87.32.0.0/12',
4605 'IL': '79.176.0.0/13',
4606 'IM': '5.62.80.0/20',
4607 'IN': '117.192.0.0/10',
4608 'IO': '203.83.48.0/21',
4609 'IQ': '37.236.0.0/14',
4610 'IR': '2.176.0.0/12',
4611 'IS': '82.221.0.0/16',
4612 'IT': '79.0.0.0/10',
4613 'JE': '87.244.64.0/18',
4614 'JM': '72.27.0.0/17',
4615 'JO': '176.29.0.0/16',
53896ca5 4616 'JP': '133.0.0.0/8',
773f291d
S
4617 'KE': '105.48.0.0/12',
4618 'KG': '158.181.128.0/17',
4619 'KH': '36.37.128.0/17',
4620 'KI': '103.25.140.0/22',
4621 'KM': '197.255.224.0/20',
53896ca5 4622 'KN': '198.167.192.0/19',
773f291d
S
4623 'KP': '175.45.176.0/22',
4624 'KR': '175.192.0.0/10',
4625 'KW': '37.36.0.0/14',
4626 'KY': '64.96.0.0/15',
4627 'KZ': '2.72.0.0/13',
4628 'LA': '115.84.64.0/18',
4629 'LB': '178.135.0.0/16',
53896ca5 4630 'LC': '24.92.144.0/20',
773f291d
S
4631 'LI': '82.117.0.0/19',
4632 'LK': '112.134.0.0/15',
53896ca5 4633 'LR': '102.183.0.0/16',
773f291d
S
4634 'LS': '129.232.0.0/17',
4635 'LT': '78.56.0.0/13',
4636 'LU': '188.42.0.0/16',
4637 'LV': '46.109.0.0/16',
4638 'LY': '41.252.0.0/14',
4639 'MA': '105.128.0.0/11',
4640 'MC': '88.209.64.0/18',
4641 'MD': '37.246.0.0/16',
4642 'ME': '178.175.0.0/17',
4643 'MF': '74.112.232.0/21',
4644 'MG': '154.126.0.0/17',
4645 'MH': '117.103.88.0/21',
4646 'MK': '77.28.0.0/15',
4647 'ML': '154.118.128.0/18',
4648 'MM': '37.111.0.0/17',
4649 'MN': '49.0.128.0/17',
4650 'MO': '60.246.0.0/16',
4651 'MP': '202.88.64.0/20',
4652 'MQ': '109.203.224.0/19',
4653 'MR': '41.188.64.0/18',
4654 'MS': '208.90.112.0/22',
4655 'MT': '46.11.0.0/16',
4656 'MU': '105.16.0.0/12',
4657 'MV': '27.114.128.0/18',
53896ca5 4658 'MW': '102.70.0.0/15',
773f291d
S
4659 'MX': '187.192.0.0/11',
4660 'MY': '175.136.0.0/13',
4661 'MZ': '197.218.0.0/15',
4662 'NA': '41.182.0.0/16',
4663 'NC': '101.101.0.0/18',
4664 'NE': '197.214.0.0/18',
4665 'NF': '203.17.240.0/22',
4666 'NG': '105.112.0.0/12',
4667 'NI': '186.76.0.0/15',
4668 'NL': '145.96.0.0/11',
4669 'NO': '84.208.0.0/13',
4670 'NP': '36.252.0.0/15',
4671 'NR': '203.98.224.0/19',
4672 'NU': '49.156.48.0/22',
4673 'NZ': '49.224.0.0/14',
4674 'OM': '5.36.0.0/15',
4675 'PA': '186.72.0.0/15',
4676 'PE': '186.160.0.0/14',
4677 'PF': '123.50.64.0/18',
4678 'PG': '124.240.192.0/19',
4679 'PH': '49.144.0.0/13',
4680 'PK': '39.32.0.0/11',
4681 'PL': '83.0.0.0/11',
4682 'PM': '70.36.0.0/20',
4683 'PR': '66.50.0.0/16',
4684 'PS': '188.161.0.0/16',
4685 'PT': '85.240.0.0/13',
4686 'PW': '202.124.224.0/20',
4687 'PY': '181.120.0.0/14',
4688 'QA': '37.210.0.0/15',
53896ca5 4689 'RE': '102.35.0.0/16',
773f291d 4690 'RO': '79.112.0.0/13',
53896ca5 4691 'RS': '93.86.0.0/15',
773f291d 4692 'RU': '5.136.0.0/13',
53896ca5 4693 'RW': '41.186.0.0/16',
773f291d
S
4694 'SA': '188.48.0.0/13',
4695 'SB': '202.1.160.0/19',
4696 'SC': '154.192.0.0/11',
53896ca5 4697 'SD': '102.120.0.0/13',
773f291d 4698 'SE': '78.64.0.0/12',
53896ca5 4699 'SG': '8.128.0.0/10',
773f291d
S
4700 'SI': '188.196.0.0/14',
4701 'SK': '78.98.0.0/15',
53896ca5 4702 'SL': '102.143.0.0/17',
773f291d
S
4703 'SM': '89.186.32.0/19',
4704 'SN': '41.82.0.0/15',
53896ca5 4705 'SO': '154.115.192.0/18',
773f291d
S
4706 'SR': '186.179.128.0/17',
4707 'SS': '105.235.208.0/21',
4708 'ST': '197.159.160.0/19',
4709 'SV': '168.243.0.0/16',
4710 'SX': '190.102.0.0/20',
4711 'SY': '5.0.0.0/16',
4712 'SZ': '41.84.224.0/19',
4713 'TC': '65.255.48.0/20',
4714 'TD': '154.68.128.0/19',
4715 'TG': '196.168.0.0/14',
4716 'TH': '171.96.0.0/13',
4717 'TJ': '85.9.128.0/18',
4718 'TK': '27.96.24.0/21',
4719 'TL': '180.189.160.0/20',
4720 'TM': '95.85.96.0/19',
4721 'TN': '197.0.0.0/11',
4722 'TO': '175.176.144.0/21',
4723 'TR': '78.160.0.0/11',
4724 'TT': '186.44.0.0/15',
4725 'TV': '202.2.96.0/19',
4726 'TW': '120.96.0.0/11',
4727 'TZ': '156.156.0.0/14',
53896ca5
S
4728 'UA': '37.52.0.0/14',
4729 'UG': '102.80.0.0/13',
4730 'US': '6.0.0.0/8',
773f291d 4731 'UY': '167.56.0.0/13',
53896ca5 4732 'UZ': '84.54.64.0/18',
773f291d 4733 'VA': '212.77.0.0/19',
53896ca5 4734 'VC': '207.191.240.0/21',
773f291d 4735 'VE': '186.88.0.0/13',
53896ca5 4736 'VG': '66.81.192.0/20',
773f291d
S
4737 'VI': '146.226.0.0/16',
4738 'VN': '14.160.0.0/11',
4739 'VU': '202.80.32.0/20',
4740 'WF': '117.20.32.0/21',
4741 'WS': '202.4.32.0/19',
4742 'YE': '134.35.0.0/16',
4743 'YT': '41.242.116.0/22',
4744 'ZA': '41.0.0.0/11',
53896ca5
S
4745 'ZM': '102.144.0.0/13',
4746 'ZW': '102.177.192.0/18',
773f291d
S
4747 }
4748
4749 @classmethod
5f95927a
S
4750 def random_ipv4(cls, code_or_block):
4751 if len(code_or_block) == 2:
4752 block = cls._country_ip_map.get(code_or_block.upper())
4753 if not block:
4754 return None
4755 else:
4756 block = code_or_block
773f291d 4757 addr, preflen = block.split('/')
ac668111 4758 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4759 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4760 return str(socket.inet_ntoa(
ac668111 4761 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4762
4763
class PerRequestProxyHandler(urllib.request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Install default http/https openers first; the base class will
        # override them for any scheme present in `proxies`
        for scheme in ('http', 'https'):
            def opener(r, proxy='__noproxy__', type=scheme, meth=self.proxy_open):
                return meth(r, proxy, type)
            setattr(self, '%s_open' % scheme, opener)
        urllib.request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request header overrides whatever proxy was configured
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy
        scheme = urllib.parse.urlparse(proxy).scheme.lower()
        if scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # yt-dlp's http/https handlers do wrapping the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        return urllib.request.ProxyHandler.proxy_open(self, req, proxy, type)
5bc880b9
YCH
4787
4788
0a5445dd
YCH
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387

def long_to_bytes(n, blocksize=0):
    """Convert a (long) integer to a big-endian byte string.

    Leading zero bytes are stripped; if `blocksize` is greater than zero the
    result is front-padded with NUL bytes so its length is a multiple of
    `blocksize`. Non-positive n yields b'\\000' (matching the PyCrypto
    original, which collapses an all-zero buffer to a single NUL byte).
    """
    n = int(n)
    if n > 0:
        # int.to_bytes at minimal length already carries no leading zeros
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    if blocksize > 0 and len(s) % blocksize:
        s = b'\000' * (blocksize - len(s) % blocksize) + s
    return s
4821
4822
def bytes_to_long(s):
    """Convert a big-endian byte string to a (long) integer.

    This is (essentially) the inverse of long_to_bytes(). The front-padding
    done by the original PyCrypto implementation is unnecessary here:
    leading zero bytes do not change the value, and an empty string is 0.
    """
    return int.from_bytes(s, 'big')
4838
4839
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is read little-endian, hence the [::-1] reversal before
    # interpreting the hex digits as a big-endian integer
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return format(encrypted, 'x')
81bdc8fd
YCH
4855
4856
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    @raises ValueError when the data cannot fit (needs 11 bytes of overhead)
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 8017 (PKCS #1 v2.2) requires the padding string PS to consist of
    # *nonzero* octets: a zero byte would be mistaken for the 0x00 separator
    # when the message is de-padded. The previous randint(0, 254) could emit
    # zero bytes (and could never emit 0xff).
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4870
4871
7b2c3f47 4872def _base_n_table(n, table):
4873 if not table and not n:
4874 raise ValueError('Either table or n must be specified')
612f2be5 4875 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4876
44f14eb4 4877 if n and n != len(table):
612f2be5 4878 raise ValueError(f'base {n} exceeds table length {len(table)}')
4879 return table
59f898b7 4880
5eb6bdce 4881
def encode_base_n(num, n=None, table=None):
    """Convert given int to a base-n string"""
    table = _base_n_table(n, table)
    if not num:
        return table[0]

    base = len(table)
    digits = []
    # Collect least-significant digits first, then reverse
    while num:
        num, remainder = divmod(num, base)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
4893
4894
def decode_base_n(string, n=None, table=None):
    """Convert given base-n string to int"""
    lookup = {char: index for index, char in enumerate(_base_n_table(n, table))}
    base = len(lookup)
    result = 0
    # Standard positional accumulation, most-significant digit first
    for char in string:
        result = result * base + lookup[char]
    return result
4902
4903
def decode_base(value, digits):
    """Deprecated alias of decode_base_n(); kept for backward compatibility."""
    deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
                        f'in a future version. Use {__name__}.decode_base_n instead')
    return decode_base_n(value, table=digits)
f52354a8
YCH
4908
4909
def decode_packed_codes(code):
    """Unpack JavaScript obfuscated with the 'p.a.c.k.e.r.' scheme back
    into readable source (pattern supplied by module-level PACKED_CODES_RE)."""
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each base-n encoded index to its replacement symbol; an empty
    # symbol slot means the token stands for itself
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        key = encode_base_n(index, base)
        symbol_table[key] = symbols[index] or key

    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)], obfuscated_code)
e154c651 4926
4927
1ced2221
S
def caesar(s, alphabet, shift):
    """Shift every character of `s` found in `alphabet` by `shift` positions
    (wrapping around); characters outside `alphabet` pass through unchanged."""
    if shift == 0:
        return s
    size = len(alphabet)
    shifted = []
    for ch in s:
        pos = alphabet.find(ch)
        shifted.append(ch if pos < 0 else alphabet[(pos + shift) % size])
    return ''.join(shifted)
4935
4936
def rot47(s):
    """Apply the ROT47 cipher: rotate printable ASCII 33 ('!') through
    126 ('~') by 47 positions; everything else passes through unchanged."""
    return ''.join(
        chr(33 + (ord(c) - 33 + 47) % 94) if '!' <= c <= '~' else c
        for c in s)
4939
4940
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=val,KEY="quoted,val",...') into a
    dict; surrounding quotes are stripped from quoted values."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
1143535d
YCH
4948
4949
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript '>>>' semantics)."""
    if val < 0:
        # Reinterpret the negative value as its unsigned 32-bit counterpart
        val += 0x100000000
    return val >> n
d3f8e038
YCH
4952
4953
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG data into (width, height, pixels).

    pixels is a list of rows; each row is a flat list of reconstructed byte
    values, 3 bytes per pixel, so pixels[y][x] addresses a channel byte, not
    a whole pixel.
    NOTE(review): the stride of width*3 assumes non-interlaced 8-bit RGB
    data; other colour types / bit depths are not handled — confirm callers
    only pass such images.

    Raises OSError if the data is not a PNG or contains no IDAT chunks.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # Validate the 8-byte PNG signature and that the first chunk is IHDR
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    # Big-endian integer unpack helpers keyed by field width in bytes
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]

    chunks = []

    # Chunk layout: 4-byte length, 4-byte type, <length> bytes data, 4-byte CRC
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is guaranteed (checked above) to be the first chunk
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Image data may be split across several IDAT chunks; concatenate them
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # 3 bytes per pixel; every scanline is prefixed by one filter-type byte
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # Look up an already-reconstructed byte by its flat index
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)  # +1 skips each row's filter-type byte
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Filter neighbours: 'left' is the same channel of the previous
            # pixel (3 bytes back), 'up' is the byte directly above
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Pick the predictor closest to p (ties prefer a, then b)
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
efa97bdc
YCH
5059
5060
def write_xattr(path, key, value):
    """Write extended attribute `key` with bytes `value` onto the file at `path`.

    Tries, in order: NTFS Alternate Data Streams (Windows), the pyxattr/xattr
    python modules, then the setfattr/xattr command-line tools.
    Raises XAttrMetadataError when writing fails and XAttrUnavailableError
    when no write method is available on this system.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            # 'path:key' opens the named alternate data stream of the file
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The CLI tools take the value as text, not bytes
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
5110
5111
def random_birthday(year_field, month_field, day_field):
    """Pick a uniformly random date between 1950-01-01 and 1995-12-31 and
    return it as a dict keyed by the given field names (values are strings)."""
    first = datetime.date(1950, 1, 1)
    last = datetime.date(1995, 12, 31)
    chosen = first + datetime.timedelta(days=random.randint(0, (last - first).days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
732044af 5122
c76eb41b 5123
# Templates for internet shortcut files, which are plain text files.

# Windows/KDE '.url' internet shortcut
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS '.webloc' shortcut (a property-list XML file)
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org '.desktop' link entry (Linux desktops)
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps the --write-link format names to their templates
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5155
732044af 5156
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """
    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` sets below list characters that must NOT be percent-encoded;
    # everything else but letters, digits and '_.-' is percent-encoded with
    # UTF-8, while already-encoded sequences are left intact.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes
    netloc = ''
    if iri_parts.username:
        netloc += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            netloc += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        netloc += '@'

    # The 'idna' (Punycode) codec produces pure-ASCII hostnames
    netloc += iri_parts.hostname.encode('idna').decode()
    if iri_parts.port is not None and iri_parts.port != 80:
        netloc += f':{iri_parts.port}'

    return urllib.parse.urlunparse((
        iri_parts.scheme,
        netloc,
        urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
        # Params use a legacy way of carrying parameters; the `safe` set
        # mirrors the one used for the path
        urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
        urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
        urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5199
5200
def to_high_limit_path(path):
    """On Windows/Cygwin, prefix the absolute path with '\\\\?\\' to bypass
    the MAX_PATH limit; on other platforms return the path unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # The individual path segments may still be length-limited
    return '\\\\?\\' + os.path.abspath(path)
76d321f6 5207
c76eb41b 5208
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
    """Extract `field` from `obj` (via traverse_obj), pass it through `func`
    and interpolate into `template`; return `default` for ignorable values."""
    val = traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        # Default rule: skip falsy values, but keep a literal zero
        skip = not val and val != 0
    else:
        skip = val in variadic(ignore)
    return default if skip else template % func(val)
00dd0cd5 5214
5215
def clean_podcast_url(url):
    """Strip known podcast tracking/analytics redirect prefixes from `url`."""
    tracker_prefix_re = r'''(?x)
    (?:
        (?:
            chtbl\.com/track|
            media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
            play\.podtrac\.com
        )/[^/]+|
        (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
        flex\.acast\.com|
        pd(?:
            cn\.co| # https://podcorn.com/analytics-prefix/
            st\.fm # https://podsights.com/docs/
        )/e
    )/'''
    return re.sub(tracker_prefix_re, '', url)
ffcb8191
THD
5231
5232
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random UUIDv4-shaped string.

    Note: only the version nibble is fixed ('4'); the variant bits of the
    'y' nibble are fully random, unlike a strict RFC 4122 UUID.
    """
    template = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'
    return ''.join(
        _HEX_TABLE[random.randint(0, 15)] if ch in 'xy' else ch
        for ch in template)
0202b52a 5238
5239
def make_dir(path, to_screen=None):
    """Ensure the parent directory of `path` exists.

    @param path       target file path whose directory should be created
    @param to_screen  optional callable used to report a failure message
    @returns True on success (or nothing to do), False on failure
    """
    try:
        dn = os.path.dirname(path)
        if dn:
            # exist_ok avoids the racy exists()-then-makedirs() pattern
            os.makedirs(dn, exist_ok=True)
        return True
    except OSError as err:
        # BUGFIX: was `if callable(to_screen) is not None:` which is always
        # True (a bool is never None), so to_screen=None crashed on errors
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
f74980cb 5250
5251
def get_executable_path():
    """Return the directory that contains the running yt-dlp executable/script."""
    from .update import _get_variant_and_executable_path

    exe_path = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(exe_path))
f74980cb 5256
5257
def load_plugins(name, suffix, namespace):
    """Load plugin classes from ytdlp_plugins/<name>/__init__.py into
    `namespace`; returns a dict of only the newly added members."""
    classes = {}
    # A missing plugin package is not an error
    with contextlib.suppress(FileNotFoundError):
        spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(spec)
        sys.modules[spec.name] = plugins  # register so the module is importable
        spec.loader.exec_module(plugins)
        for member_name in dir(plugins):
            # Skip names already present and those without the wanted suffix
            if member_name in namespace or not member_name.endswith(suffix):
                continue
            classes[member_name] = namespace[member_name] = getattr(plugins, member_name)
    return classes
06167fbb 5274
5275
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                            - None:     Do nothing
                            - string:   A dictionary key / regex group
                            - int:      An index into a list
                            - tuple:    A list of keys all of which will be traversed
                            - Ellipsis: Fetch all values in the object
                            - Function: Takes the key and value as arguments
                                        and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive

    The following are only meant to be used by YoutubeDL.prepare_outtmpl and is not part of the API

    @param path_list        In addition to the above,
                            - dict: Given {k:v, ...}; return {k: traverse_obj(obj, v), ...}
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    '''  # TODO: Write tests
    if not casesense:
        # Normalize string keys in every path up-front
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` (nonlocal) records how many branching levels ('...', dict
        # keys, filter functions) were taken, so the caller knows how many
        # nesting levels to flatten from the result
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            # None as a key (or having reached None) stops traversal here
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Branch: traverse each alternative key, then treat the
                # collected results like an Ellipsis expansion
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...

            if key is ...:
                # Expand to all values of the current object
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif isinstance(key, dict):
                # Build a dict by traversing each sub-path; drop empty results
                obj = filter_dict({k: _traverse_obj(obj, v, _current_depth) for k, v in key.items()})
            elif callable(key):
                # Filter function: keep (key, value) pairs for which it is truthy
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # Plain dict lookup (case-insensitive fallback scans items)
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # Convert user-supplied strings into int indices or slices
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # '[:]' is equivalent to expanding with Ellipsis
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        # A type acts as an isinstance filter; a callable acts as a mapper
        type_test = lambda val: val if isinstance(val, expected_type) else None
    else:
        type_test = expected_type or IDENTITY

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Flatten all but one branching level, then type-filter
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
324ad820 5379
5380
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj(); kept for backward compatibility."""
    deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.traverse_obj" instead')
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5385
5386
def get_first(obj, keys, **kwargs):
    """Return the first value matching `keys` across all items of `obj`
    (a convenience wrapper around traverse_obj with get_all=False)."""
    path = (..., *variadic(keys))
    return traverse_obj(obj, path, **kwargs, get_all=False)
5389
5390
def variadic(x, allowed_types=(str, bytes, dict)):
    """Wrap `x` into a 1-tuple unless it is already an iterable that is not
    one of `allowed_types` (which are treated as atomic values)."""
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types):
        return x
    return (x,)
bd50a52b
THD
5393
5394
3e9b66d7
LNO
def time_seconds(**kwargs):
    """Return the current epoch timestamp; kwargs are timedelta arguments
    naming the reference timezone's UTC offset (e.g. hours=9)."""
    zone = datetime.timezone(datetime.timedelta(**kwargs))
    return datetime.datetime.now(zone).timestamp()
5398
5399
49fa4d9a
N
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create an HS256-signed JWT in JWS Compact Serialization form (bytes).

    @param payload_data  JSON-serializable claims
    @param key           shared secret (str)
    @param headers       optional extra/override JOSE header fields
    """
    # BUGFIX: `headers={}` was a mutable default argument; use None instead
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    # The signature covers '<header>.<payload>' exactly as serialized above
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
819e0531 5417
5418
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode and return the payload of a JWT *without* verifying its signature."""
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    return json.loads(base64.urlsafe_b64decode(payload_b64))
5424
5425
53973b4d 5426WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5427
5428
@functools.cache
def supports_terminal_sequences(stream):
    """Whether ANSI terminal sequences may be written to `stream` (cached per stream)."""
    if compat_os_name == 'nt':
        # Windows consoles need VT processing enabled first
        if not WINDOWS_VT_MODE:
            return False
    elif not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        # Broken/closed pseudo-streams may raise anything from isatty()
        return False
5440
5441
def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    # VT (ANSI) sequences require Windows 10 build 10586 (TH2) or newer
    if get_windows_version() < (10, 0, 10586):
        return
    global WINDOWS_VT_MODE
    try:
        # Spawning a (no-op) shell command makes the console host enable VT
        # processing for this process; on failure leave the flag unchanged
        Popen.run('', shell=True)
    except Exception:
        return

    WINDOWS_VT_MODE = True
    # supports_terminal_sequences() is memoized; its answer may have changed
    supports_terminal_sequences.cache_clear()
5453
5454
# Matches ANSI SGR escape sequences such as '\033[31m' / '\033[0m'
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip all ANSI SGR escape sequences from `string`."""
    return _terminal_sequences_re.sub('', string)
5460
5461
def number_of_digits(number):
    """Count the characters in the decimal rendering of `number`,
    including any leading '-' sign ('%d' also truncates floats)."""
    rendered = '%d' % number
    return len(rendered)
34921b43 5464
5465
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy values with `delim`; when
    `from_dict` is given, each value is treated as a traversal path into it."""
    if from_dict is not None:
        values = (traverse_obj(from_dict, variadic(v)) for v in values)
    return delim.join(str(v) for v in values if v)
06e57990 5470
5471
27231526
ZM
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        # No format declares a width; leave the thumbnails untouched
        return thumbnails
    max_width = str(max_dimensions[0])
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, max_width, thumb['url'])},
            dict(zip(_keys, max_dimensions)), thumb)
        for thumb in thumbnails
    ]
5492
5493
93c8410d
LNO
def parse_http_range(range):
    """Parse a "Range" or "Content-Range" HTTP header value into a
    (start, end, total) tuple; unknown parts come back as None."""
    match = range and re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not match:
        return None, None, None
    start, end, total = match.groups()
    return int(start), int_or_none(end), int_or_none(total)
5502
5503
def read_stdin(what):
    """Announce that `what` will be read from STDIN, then return sys.stdin."""
    eof_key = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof_key}) to end:\n')
    return sys.stdin
5508
5509
a904a7f8
L
def determine_file_encoding(data):
    """
    Detect the text encoding used
    @returns (encoding, bytes to skip)
    """
    # BOM marks are given priority over in-file coding declarations
    bom_match = next(
        ((enc, len(bom)) for bom, enc in BOMS if data.startswith(bom)), None)
    if bom_match:
        return bom_match

    # Drop NUL bytes so the declaration still matches UTF-16/UTF-32 text;
    # endianness is deliberately ignored — a rough match is good enough
    stripped = data.replace(b'\0', b'')
    mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', stripped)
    return (mobj.group(1).decode() if mobj else None), 0
a904a7f8
L
5526
5527
class Config:
    """A (possibly nested) set of command-line arguments.

    A config may reference further config files via ``--config-locations``;
    those are loaded recursively, with already-seen paths skipped to avoid
    infinite loops.
    """
    own_args = None      # the raw argument list this config was created with
    parsed_args = None   # same as own_args once load_configs() has run
    filename = None      # path of the file these args came from, if any
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        # _loaded_paths is shared (by assignment) across nested configs
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Attach *args*/*filename* and load referenced configs.
        @returns False if this config file was already loaded, else True
        """
        assert not self.__initialized
        self.own_args, self.filename = args, filename
        return self.load_configs()

    def load_configs(self):
        """Parse own args and recursively append configs from --config-locations."""
        directory = ''
        if self.filename:
            location = os.path.realpath(self.filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False  # already processed this file; prevent loops
            self._loaded_paths.add(location)

        self.__initialized = True
        opts, _ = self.parser.parse_known_args(self.own_args)
        self.parsed_args = self.own_args
        for location in opts.config_locations or []:
            if location == '-':
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against this config file's directory
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read a config file and split its contents into an argument list.

        Returns *default* when the file cannot be opened; raises ValueError
        when the contents cannot be parsed.
        (NB: the shared mutable default is returned, never mutated.)
        """
        try:
            optionf = open(filename, 'rb')
        except OSError:
            return default  # silently skip if file is not present
        try:
            enc, skip = determine_file_encoding(optionf.read(512))
            optionf.seek(skip, io.SEEK_SET)  # skip past any BOM
        except OSError:
            enc = None  # silently skip read errors
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read().decode(enc or preferredencoding())
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # Include the offending file in the error (was a literal "(unknown)")
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of *opts* with credential values replaced by 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handles the "--password=secret" form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            # Handles the "--password secret" (two-token) form
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Load another config (sharing the loaded-path set) and keep it if new."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        # Nested configs first (lowest priority), own args last
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
da42679b
LNO
5632
5633
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""
    # The live connection object (result of `websockets.connect(...).__aenter__`);
    # None until __enter__ has run
    pool = None

    def __init__(self, url, headers=None, connect=True):
        # A dedicated event loop lets synchronous callers drive the async API
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Guarantee teardown even if the caller never calls __exit__
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        # Connect once; re-entering is a no-op while the pool exists
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        # Synchronously send a message over the websocket
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        # Synchronously receive a message from the websocket
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # NOTE(review): the loop is closed *before* _cancel_all_tasks calls
            # run_until_complete on it — this ordering looks inverted; confirm
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run coroutine *main* to completion on *loop* and return its result."""
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            # Mirror asyncio.run() cleanup: flush async generators / default executor
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel all pending tasks on *loop* and report unexpected exceptions."""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5703
5704
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in headers.items():
            # Title-casing normalizes the key, so later dicts overwrite earlier ones
            merged[name.title()] = value
    return merged
28787f16 5708
5709
def cached_method(f):
    """Cache a method"""
    sig = inspect.signature(f)

    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Normalize the call so equivalent invocations share one cache slot
        bound = sig.bind(self, *args, **kwargs)
        bound.apply_defaults()
        cache_key = tuple(bound.arguments.values())

        # Per-instance store, keyed first by method name, then by arguments
        try:
            store = self.__cached_method__cache
        except AttributeError:
            store = self.__cached_method__cache = {}
        cache = store.setdefault(f.__name__, {})
        if cache_key not in cache:
            cache[cache_key] = f(self, *args, **kwargs)
        return cache[cache_key]
    return wrapper
5727
5728
class classproperty:
    """property access for class methods"""

    def __init__(self, func):
        # Keep the wrapped function's name/docstring on the descriptor
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, instance, cls):
        # The instance (if any) is ignored; always call with the owning class
        return self.func(cls)
19a03940 5738
5739
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""
    # NOTE(review): nothing here actually enforces immutability (and the
    # MEDIA_EXTENSIONS constant below is mutated via `+=`) — confirm intent

    def __iter__(self):
        # Iterating a Namespace yields its attribute *values*
        yield from self.__dict__.values()

    @property
    def items_(self):
        # Trailing underscore avoids clashing with a potential `items` attribute
        return self.__dict__.items()
9b8ee23b 5749
5750
# Known media-related file extensions, grouped by kind. The `common_*` groups
# are folded into the full `video`/`audio` groups immediately below.
MEDIA_EXTENSIONS = Namespace(
    common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
    video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
    common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
    audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
    thumbnails=('jpg', 'png', 'webp'),
    storyboards=('mhtml', ),
    subtitles=('srt', 'vtt', 'ass', 'lrc'),
    manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
)
MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio

# All video, audio and manifest extensions combined (thumbnails/subtitles excluded)
KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5765
5766
class RetryManager:
    """Usage:
        for retry in RetryManager(...):
            try:
                ...
            except SomeException as err:
                retry.error = err
                continue
    """
    # `_error` is NO_DEFAULT while an attempt is in flight ("no result recorded
    # yet"), the caught error after a failed attempt, or None/falsy on success
    attempt, _error = 0, None

    def __init__(self, _retries, _error_callback, **kwargs):
        # `_retries` may be None/0, meaning a single attempt with no retries
        self.retries = _retries or 0
        # Pre-bind extra kwargs so the callback is invoked as (error, attempt, retries)
        self.error_callback = functools.partial(_error_callback, **kwargs)

    def _should_retry(self):
        # Continue while the previous attempt recorded an error (or none ran yet)
        # and the attempt budget is not exhausted
        return self._error is not NO_DEFAULT and self.attempt <= self.retries

    @property
    def error(self):
        # Hide the internal sentinel: callers always see a real error or None
        if self._error is NO_DEFAULT:
            return None
        return self._error

    @error.setter
    def error(self, value):
        self._error = value

    def __iter__(self):
        while self._should_retry():
            self.error = NO_DEFAULT  # reset before each attempt
            self.attempt += 1
            yield self
        # Loop ended with an error still recorded => all attempts failed
        if self.error:
            self.error_callback(self.error, self.attempt, self.retries)

    @staticmethod
    def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
        """Utility function for reporting retries"""
        if count > retries:
            # Out of retries: report through `error` if provided, else re-raise
            if error:
                return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
            raise e

        if not count:
            return warn(e)
        elif isinstance(e, ExtractorError):
            # Prefer the underlying cause (or original message) for a cleaner warning
            e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
        warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')

        # `sleep_func` may be a constant delay or a callable taking the retry index
        delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
        if delay:
            info(f'Sleeping {delay:.2f} seconds ...')
            time.sleep(delay)
5821
5822
def make_archive_id(ie, video_id):
    """Build a download-archive entry ID from an extractor (or its key) and a video ID."""
    if isinstance(ie, str):
        ie_key = ie
    else:
        ie_key = ie.ie_key()
    return f'{ie_key.lower()} {video_id}'
5826
5827
def truncate_string(s, left, right=0):
    """Truncate *s* to at most ``left + right`` characters, marking the cut with '...'.

    Keeps ``left - 3`` characters from the start and ``right`` from the end.
    Returns *s* unchanged if it is None or already short enough.
    """
    assert left > 3 and right >= 0
    if s is None or len(s) <= left + right:
        return s
    # `s[-right:]` with right == 0 would yield the WHOLE string, so guard it
    return f'{s[:left - 3]}...{s[-right:] if right else ""}'
5833
5834
def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
    """Resolve *options* (supporting aliases and '-'-prefixed discards) into an ordered set."""
    assert 'all' in alias_dict, '"all" alias is required'
    requested = list(start or [])
    for val in options:
        discard = val.startswith('-')
        if discard:
            val = val[1:]

        if val in alias_dict:
            # Expand the alias; discarding an alias flips the sign of each member
            expansion = alias_dict[val]
            if discard:
                expansion = [item[1:] if item.startswith('-') else f'-{item}' for item in expansion]
            # NB: Do not allow regex in aliases for performance
            requested = orderedSet_from_options(expansion, alias_dict, start=requested)
            continue

        if use_regex:
            current = filter(re.compile(val, re.I).fullmatch, alias_dict['all'])
        elif val in alias_dict['all']:
            current = [val]
        else:
            current = None
        if current is None:
            raise ValueError(val)

        if discard:
            # Remove every occurrence of each matched item
            for item in current:
                while item in requested:
                    requested.remove(item)
        else:
            requested.extend(current)

    return orderedSet(requested)
5863
5864
# Deprecated
# Kept for backward compatibility with code that imported these flags;
# new code should test the optional-dependency modules directly
has_certifi = bool(certifi)
has_websockets = bool(websockets)