]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
[cleanup] Consistent style for file heads
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
15dfb392 1import atexit
1e399778 2import base64
5bc880b9 3import binascii
912b38b4 4import calendar
676eb3f2 5import codecs
c380cc28 6import collections
62e609ab 7import contextlib
e3946f98 8import ctypes
c496ca96 9import datetime
0c265486 10import email.header
f8271158 11import email.utils
f45c185f 12import errno
d77c3dfd 13import gzip
49fa4d9a
N
14import hashlib
15import hmac
ac668111 16import html.entities
17import html.parser
54007a45 18import http.client
19import http.cookiejar
019a94f7 20import importlib.util
03f9daab 21import io
79a2e94e 22import itertools
f4bfd65f 23import json
d77c3dfd 24import locale
02dbf93f 25import math
f8271158 26import mimetypes
347de493 27import operator
d77c3dfd 28import os
c496ca96 29import platform
773f291d 30import random
d77c3dfd 31import re
f8271158 32import shlex
c496ca96 33import socket
79a2e94e 34import ssl
ac668111 35import struct
1c088fa8 36import subprocess
d77c3dfd 37import sys
181c8655 38import tempfile
c380cc28 39import time
01951dda 40import traceback
64fa820c 41import types
f8271158 42import urllib.parse
ac668111 43import urllib.request
bcf89ce6 44import xml.etree.ElementTree
d77c3dfd 45import zlib
d77c3dfd 46
c487cf00 47from .compat import asyncio, functools # isort: split
8c25f81b 48from .compat import (
36e6f62c 49 compat_etree_fromstring,
51098426 50 compat_expanduser,
f8271158 51 compat_HTMLParseError,
f8271158 52 compat_HTTPError,
efa97bdc 53 compat_os_name,
8c25f81b 54 compat_parse_qs,
702ccf2d 55 compat_shlex_quote,
8c25f81b 56 compat_str,
15707c7e 57 compat_urllib_parse_urlencode,
8c25f81b 58 compat_urllib_parse_urlparse,
8c25f81b
PH
59 compat_urlparse,
60)
ac668111 61from .dependencies import brotli, certifi, websockets, xattr
f8271158 62from .socks import ProxyType, sockssocket
71aff188 63
4644ac55 64
51fb4995
YCH
65def register_socks_protocols():
66 # "Register" SOCKS protocols
d5ae6bb5
YCH
67 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
68 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
51fb4995
YCH
69 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
70 if scheme not in compat_urlparse.uses_netloc:
71 compat_urlparse.uses_netloc.append(scheme)
72
73
468e2e92
FV
74# This is not clearly defined otherwise
75compiled_regex_type = type(re.compile(''))
76
f7a147e3
S
77
78def random_user_agent():
79 _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
80 _CHROME_VERSIONS = (
19b4c74d 81 '90.0.4430.212',
82 '90.0.4430.24',
83 '90.0.4430.70',
84 '90.0.4430.72',
85 '90.0.4430.85',
86 '90.0.4430.93',
87 '91.0.4472.101',
88 '91.0.4472.106',
89 '91.0.4472.114',
90 '91.0.4472.124',
91 '91.0.4472.164',
92 '91.0.4472.19',
93 '91.0.4472.77',
94 '92.0.4515.107',
95 '92.0.4515.115',
96 '92.0.4515.131',
97 '92.0.4515.159',
98 '92.0.4515.43',
99 '93.0.4556.0',
100 '93.0.4577.15',
101 '93.0.4577.63',
102 '93.0.4577.82',
103 '94.0.4606.41',
104 '94.0.4606.54',
105 '94.0.4606.61',
106 '94.0.4606.71',
107 '94.0.4606.81',
108 '94.0.4606.85',
109 '95.0.4638.17',
110 '95.0.4638.50',
111 '95.0.4638.54',
112 '95.0.4638.69',
113 '95.0.4638.74',
114 '96.0.4664.18',
115 '96.0.4664.45',
116 '96.0.4664.55',
117 '96.0.4664.93',
118 '97.0.4692.20',
f7a147e3
S
119 )
120 return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
121
122
4390d5ec 123SUPPORTED_ENCODINGS = [
124 'gzip', 'deflate'
125]
9b8ee23b 126if brotli:
4390d5ec 127 SUPPORTED_ENCODINGS.append('br')
128
3e669f36 129std_headers = {
f7a147e3 130 'User-Agent': random_user_agent(),
59ae15a5 131 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59ae15a5 132 'Accept-Language': 'en-us,en;q=0.5',
b1156c1e 133 'Sec-Fetch-Mode': 'navigate',
3e669f36 134}
f427df17 135
5f6a1245 136
fb37eb25
S
137USER_AGENTS = {
138 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
139}
140
141
bf42a990 142NO_DEFAULT = object()
7b2c3f47 143IDENTITY = lambda x: x
bf42a990 144
7105440c
YCH
145ENGLISH_MONTH_NAMES = [
146 'January', 'February', 'March', 'April', 'May', 'June',
147 'July', 'August', 'September', 'October', 'November', 'December']
148
f6717dec
S
149MONTH_NAMES = {
150 'en': ENGLISH_MONTH_NAMES,
151 'fr': [
3e4185c3
S
152 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
153 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 154}
a942d6cb 155
a7aaa398
S
156KNOWN_EXTENSIONS = (
157 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
158 'flv', 'f4v', 'f4a', 'f4b',
159 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
160 'mkv', 'mka', 'mk3d',
161 'avi', 'divx',
162 'mov',
163 'asf', 'wmv', 'wma',
164 '3gp', '3g2',
165 'mp3',
166 'flac',
167 'ape',
168 'wav',
169 'f4f', 'f4m', 'm3u8', 'smil')
170
c587cbb7 171# needed for sanitizing filenames in restricted mode
c8827027 172ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
fd35d8cd
JW
173 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
174 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 175
46f59e89
S
176DATE_FORMATS = (
177 '%d %B %Y',
178 '%d %b %Y',
179 '%B %d %Y',
cb655f34
S
180 '%B %dst %Y',
181 '%B %dnd %Y',
9d30c213 182 '%B %drd %Y',
cb655f34 183 '%B %dth %Y',
46f59e89 184 '%b %d %Y',
cb655f34
S
185 '%b %dst %Y',
186 '%b %dnd %Y',
9d30c213 187 '%b %drd %Y',
cb655f34 188 '%b %dth %Y',
46f59e89
S
189 '%b %dst %Y %I:%M',
190 '%b %dnd %Y %I:%M',
9d30c213 191 '%b %drd %Y %I:%M',
46f59e89
S
192 '%b %dth %Y %I:%M',
193 '%Y %m %d',
194 '%Y-%m-%d',
bccdbd22 195 '%Y.%m.%d.',
46f59e89 196 '%Y/%m/%d',
81c13222 197 '%Y/%m/%d %H:%M',
46f59e89 198 '%Y/%m/%d %H:%M:%S',
1931a55e
THD
199 '%Y%m%d%H%M',
200 '%Y%m%d%H%M%S',
4f3fa23e 201 '%Y%m%d',
0c1c6f4b 202 '%Y-%m-%d %H:%M',
46f59e89
S
203 '%Y-%m-%d %H:%M:%S',
204 '%Y-%m-%d %H:%M:%S.%f',
5014558a 205 '%Y-%m-%d %H:%M:%S:%f',
46f59e89
S
206 '%d.%m.%Y %H:%M',
207 '%d.%m.%Y %H.%M',
208 '%Y-%m-%dT%H:%M:%SZ',
209 '%Y-%m-%dT%H:%M:%S.%fZ',
210 '%Y-%m-%dT%H:%M:%S.%f0Z',
211 '%Y-%m-%dT%H:%M:%S',
212 '%Y-%m-%dT%H:%M:%S.%f',
213 '%Y-%m-%dT%H:%M',
c6eed6b8
S
214 '%b %d %Y at %H:%M',
215 '%b %d %Y at %H:%M:%S',
b555ae9b
S
216 '%B %d %Y at %H:%M',
217 '%B %d %Y at %H:%M:%S',
a63d9bd0 218 '%H:%M %d-%b-%Y',
46f59e89
S
219)
220
221DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
222DATE_FORMATS_DAY_FIRST.extend([
223 '%d-%m-%Y',
224 '%d.%m.%Y',
225 '%d.%m.%y',
226 '%d/%m/%Y',
227 '%d/%m/%y',
228 '%d/%m/%Y %H:%M:%S',
229])
230
231DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
232DATE_FORMATS_MONTH_FIRST.extend([
233 '%m-%d-%Y',
234 '%m.%d.%Y',
235 '%m/%d/%Y',
236 '%m/%d/%y',
237 '%m/%d/%Y %H:%M:%S',
238])
239
06b3fe29 240PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
22f5f5c6 241JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
06b3fe29 242
1d485a1a 243NUMBER_RE = r'\d+(?:\.\d+)?'
244
7105440c 245
0b9c08b4 246@functools.cache
d77c3dfd 247def preferredencoding():
59ae15a5 248 """Get preferred encoding.
d77c3dfd 249
59ae15a5
PH
250 Returns the best encoding scheme for the system, based on
251 locale.getpreferredencoding() and some further tweaks.
252 """
253 try:
254 pref = locale.getpreferredencoding()
28e614de 255 'TEST'.encode(pref)
70a1165b 256 except Exception:
59ae15a5 257 pref = 'UTF-8'
bae611f2 258
59ae15a5 259 return pref
d77c3dfd 260
f4bfd65f 261
181c8655 262def write_json_file(obj, fn):
1394646a 263 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 264
cfb0511d 265 tf = tempfile.NamedTemporaryFile(
266 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
267 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
181c8655
PH
268
269 try:
270 with tf:
45d86abe 271 json.dump(obj, tf, ensure_ascii=False)
1394646a
IK
272 if sys.platform == 'win32':
273 # Need to remove existing file on Windows, else os.rename raises
274 # WindowsError or FileExistsError.
19a03940 275 with contextlib.suppress(OSError):
1394646a 276 os.unlink(fn)
19a03940 277 with contextlib.suppress(OSError):
9cd5f54e
R
278 mask = os.umask(0)
279 os.umask(mask)
280 os.chmod(tf.name, 0o666 & ~mask)
181c8655 281 os.rename(tf.name, fn)
70a1165b 282 except Exception:
19a03940 283 with contextlib.suppress(OSError):
181c8655 284 os.remove(tf.name)
181c8655
PH
285 raise
286
287
cfb0511d 288def find_xpath_attr(node, xpath, key, val=None):
289 """ Find the xpath xpath[@key=val] """
290 assert re.match(r'^[a-zA-Z_-]+$', key)
86e5f3ed 291 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
cfb0511d 292 return node.find(expr)
59ae56fa 293
d7e66d39
JMF
294# On python2.6 the xml.etree.ElementTree.Element methods don't support
295# the namespace parameter
5f6a1245
JW
296
297
d7e66d39
JMF
298def xpath_with_ns(path, ns_map):
299 components = [c.split(':') for c in path.split('/')]
300 replaced = []
301 for c in components:
302 if len(c) == 1:
303 replaced.append(c[0])
304 else:
305 ns, tag = c
306 replaced.append('{%s}%s' % (ns_map[ns], tag))
307 return '/'.join(replaced)
308
d77c3dfd 309
a41fb80c 310def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 311 def _find_xpath(xpath):
f9934b96 312 return node.find(xpath)
578c0745
S
313
314 if isinstance(xpath, (str, compat_str)):
315 n = _find_xpath(xpath)
316 else:
317 for xp in xpath:
318 n = _find_xpath(xp)
319 if n is not None:
320 break
d74bebd5 321
8e636da4 322 if n is None:
bf42a990
S
323 if default is not NO_DEFAULT:
324 return default
325 elif fatal:
bf0ff932
PH
326 name = xpath if name is None else name
327 raise ExtractorError('Could not find XML element %s' % name)
328 else:
329 return None
a41fb80c
S
330 return n
331
332
333def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
8e636da4
S
334 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
335 if n is None or n == default:
336 return n
337 if n.text is None:
338 if default is not NO_DEFAULT:
339 return default
340 elif fatal:
341 name = xpath if name is None else name
342 raise ExtractorError('Could not find XML element\'s text %s' % name)
343 else:
344 return None
345 return n.text
a41fb80c
S
346
347
348def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
349 n = find_xpath_attr(node, xpath, key)
350 if n is None:
351 if default is not NO_DEFAULT:
352 return default
353 elif fatal:
86e5f3ed 354 name = f'{xpath}[@{key}]' if name is None else name
a41fb80c
S
355 raise ExtractorError('Could not find XML attribute %s' % name)
356 else:
357 return None
358 return n.attrib[key]
bf0ff932
PH
359
360
c487cf00 361def get_element_by_id(id, html, **kwargs):
43e8fafd 362 """Return the content of the tag with the specified ID in the passed HTML document"""
c487cf00 363 return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 364
12ea2f30 365
c487cf00 366def get_element_html_by_id(id, html, **kwargs):
6f32a0b5 367 """Return the html of the tag with the specified ID in the passed HTML document"""
c487cf00 368 return get_element_html_by_attribute('id', id, html, **kwargs)
6f32a0b5
ZM
369
370
84c237fb 371def get_element_by_class(class_name, html):
2af12ad9
TC
372 """Return the content of the first tag with the specified class in the passed HTML document"""
373 retval = get_elements_by_class(class_name, html)
374 return retval[0] if retval else None
375
376
6f32a0b5
ZM
377def get_element_html_by_class(class_name, html):
378 """Return the html of the first tag with the specified class in the passed HTML document"""
379 retval = get_elements_html_by_class(class_name, html)
380 return retval[0] if retval else None
381
382
c487cf00 383def get_element_by_attribute(attribute, value, html, **kwargs):
384 retval = get_elements_by_attribute(attribute, value, html, **kwargs)
2af12ad9
TC
385 return retval[0] if retval else None
386
387
c487cf00 388def get_element_html_by_attribute(attribute, value, html, **kargs):
389 retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
6f32a0b5
ZM
390 return retval[0] if retval else None
391
392
c487cf00 393def get_elements_by_class(class_name, html, **kargs):
2af12ad9
TC
394 """Return the content of all tags with the specified class in the passed HTML document as a list"""
395 return get_elements_by_attribute(
64fa820c 396 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
84c237fb
YCH
397 html, escape_value=False)
398
399
6f32a0b5
ZM
400def get_elements_html_by_class(class_name, html):
401 """Return the html of all tags with the specified class in the passed HTML document as a list"""
402 return get_elements_html_by_attribute(
64fa820c 403 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
6f32a0b5
ZM
404 html, escape_value=False)
405
406
407def get_elements_by_attribute(*args, **kwargs):
43e8fafd 408 """Return the content of the tag with the specified attribute in the passed HTML document"""
6f32a0b5
ZM
409 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
410
411
412def get_elements_html_by_attribute(*args, **kwargs):
413 """Return the html of the tag with the specified attribute in the passed HTML document"""
414 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
415
416
417def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
418 """
419 Return the text (content) and the html (whole) of the tag with the specified
420 attribute in the passed HTML document
421 """
9e6dd238 422
86e5f3ed 423 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
0254f162 424
84c237fb
YCH
425 value = re.escape(value) if escape_value else value
426
86e5f3ed 427 partial_element_re = rf'''(?x)
6f32a0b5 428 <(?P<tag>[a-zA-Z0-9:._-]+)
0254f162 429 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
86e5f3ed 430 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
431 '''
38285056 432
0254f162
ZM
433 for m in re.finditer(partial_element_re, html):
434 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
a921f407 435
0254f162
ZM
436 yield (
437 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
438 whole
439 )
a921f407 440
c5229f39 441
ac668111 442class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
6f32a0b5
ZM
443 """
444 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
445 closing tag for the first opening tag it has encountered, and can be used
446 as a context manager
447 """
448
449 class HTMLBreakOnClosingTagException(Exception):
450 pass
451
452 def __init__(self):
453 self.tagstack = collections.deque()
ac668111 454 html.parser.HTMLParser.__init__(self)
6f32a0b5
ZM
455
456 def __enter__(self):
457 return self
458
459 def __exit__(self, *_):
460 self.close()
461
462 def close(self):
463 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
464 # so data remains buffered; we no longer have any interest in it, thus
465 # override this method to discard it
466 pass
467
468 def handle_starttag(self, tag, _):
469 self.tagstack.append(tag)
470
471 def handle_endtag(self, tag):
472 if not self.tagstack:
473 raise compat_HTMLParseError('no tags in the stack')
474 while self.tagstack:
475 inner_tag = self.tagstack.pop()
476 if inner_tag == tag:
477 break
478 else:
479 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
480 if not self.tagstack:
481 raise self.HTMLBreakOnClosingTagException()
482
483
484def get_element_text_and_html_by_tag(tag, html):
485 """
486 For the first element with the specified tag in the passed HTML document
487 return its' content (text) and the whole element (html)
488 """
489 def find_or_raise(haystack, needle, exc):
490 try:
491 return haystack.index(needle)
492 except ValueError:
493 raise exc
494 closing_tag = f'</{tag}>'
495 whole_start = find_or_raise(
496 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
497 content_start = find_or_raise(
498 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
499 content_start += whole_start + 1
500 with HTMLBreakOnClosingTagParser() as parser:
501 parser.feed(html[whole_start:content_start])
502 if not parser.tagstack or parser.tagstack[0] != tag:
503 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
504 offset = content_start
505 while offset < len(html):
506 next_closing_tag_start = find_or_raise(
507 html[offset:], closing_tag,
508 compat_HTMLParseError(f'closing {tag} tag not found'))
509 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
510 try:
511 parser.feed(html[offset:offset + next_closing_tag_end])
512 offset += next_closing_tag_end
513 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
514 return html[content_start:offset + next_closing_tag_start], \
515 html[whole_start:offset + next_closing_tag_end]
516 raise compat_HTMLParseError('unexpected end of html')
517
518
ac668111 519class HTMLAttributeParser(html.parser.HTMLParser):
8bb56eee 520 """Trivial HTML parser to gather the attributes for a single element"""
b6e0c7d2 521
8bb56eee 522 def __init__(self):
c5229f39 523 self.attrs = {}
ac668111 524 html.parser.HTMLParser.__init__(self)
8bb56eee
BF
525
526 def handle_starttag(self, tag, attrs):
527 self.attrs = dict(attrs)
528
c5229f39 529
ac668111 530class HTMLListAttrsParser(html.parser.HTMLParser):
73673ccf
FF
531 """HTML parser to gather the attributes for the elements of a list"""
532
533 def __init__(self):
ac668111 534 html.parser.HTMLParser.__init__(self)
73673ccf
FF
535 self.items = []
536 self._level = 0
537
538 def handle_starttag(self, tag, attrs):
539 if tag == 'li' and self._level == 0:
540 self.items.append(dict(attrs))
541 self._level += 1
542
543 def handle_endtag(self, tag):
544 self._level -= 1
545
546
8bb56eee
BF
547def extract_attributes(html_element):
548 """Given a string for an HTML element such as
549 <el
550 a="foo" B="bar" c="&98;az" d=boz
551 empty= noval entity="&amp;"
552 sq='"' dq="'"
553 >
554 Decode and return a dictionary of attributes.
555 {
556 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
557 'empty': '', 'noval': None, 'entity': '&',
558 'sq': '"', 'dq': '\''
559 }.
8bb56eee
BF
560 """
561 parser = HTMLAttributeParser()
19a03940 562 with contextlib.suppress(compat_HTMLParseError):
b4a3d461
S
563 parser.feed(html_element)
564 parser.close()
8bb56eee 565 return parser.attrs
9e6dd238 566
c5229f39 567
73673ccf
FF
568def parse_list(webpage):
569 """Given a string for an series of HTML <li> elements,
570 return a dictionary of their attributes"""
571 parser = HTMLListAttrsParser()
572 parser.feed(webpage)
573 parser.close()
574 return parser.items
575
576
9e6dd238 577def clean_html(html):
59ae15a5 578 """Clean an HTML snippet into a readable string"""
dd622d7c
PH
579
580 if html is None: # Convenience for sanitizing descriptions etc.
581 return html
582
49185227 583 html = re.sub(r'\s+', ' ', html)
584 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
585 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
59ae15a5
PH
586 # Strip html tags
587 html = re.sub('<.*?>', '', html)
588 # Replace html entities
589 html = unescapeHTML(html)
7decf895 590 return html.strip()
9e6dd238
FV
591
592
b7c47b74 593class LenientJSONDecoder(json.JSONDecoder):
594 def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
595 self.transform_source, self.ignore_extra = transform_source, ignore_extra
596 super().__init__(*args, **kwargs)
597
598 def decode(self, s):
599 if self.transform_source:
600 s = self.transform_source(s)
601 if self.ignore_extra:
602 return self.raw_decode(s.lstrip())[0]
603 return super().decode(s)
604
605
d77c3dfd 606def sanitize_open(filename, open_mode):
59ae15a5
PH
607 """Try to open the given filename, and slightly tweak it if this fails.
608
609 Attempts to open the given filename. If this fails, it tries to change
610 the filename slightly, step by step, until it's either able to open it
611 or it fails and raises a final exception, like the standard open()
612 function.
613
614 It returns the tuple (stream, definitive_file_name).
615 """
0edb3e33 616 if filename == '-':
617 if sys.platform == 'win32':
618 import msvcrt
619 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
620 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5 621
0edb3e33 622 for attempt in range(2):
623 try:
624 try:
89737671 625 if sys.platform == 'win32':
b506289f 626 # FIXME: An exclusive lock also locks the file from being read.
627 # Since windows locks are mandatory, don't lock the file on windows (for now).
628 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
89737671 629 raise LockingUnsupportedError()
0edb3e33 630 stream = locked_file(filename, open_mode, block=False).__enter__()
8a82af35 631 except OSError:
0edb3e33 632 stream = open(filename, open_mode)
8a82af35 633 return stream, filename
86e5f3ed 634 except OSError as err:
0edb3e33 635 if attempt or err.errno in (errno.EACCES,):
636 raise
637 old_filename, filename = filename, sanitize_path(filename)
638 if old_filename == filename:
639 raise
d77c3dfd
FV
640
641
642def timeconvert(timestr):
59ae15a5
PH
643 """Convert RFC 2822 defined time string into system timestamp"""
644 timestamp = None
645 timetuple = email.utils.parsedate_tz(timestr)
646 if timetuple is not None:
647 timestamp = email.utils.mktime_tz(timetuple)
648 return timestamp
1c469a94 649
5f6a1245 650
5c3895ff 651def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
59ae15a5 652 """Sanitizes a string so it could be used as part of a filename.
5c3895ff 653 @param restricted Use a stricter subset of allowed characters
654 @param is_id Whether this is an ID that should be kept unchanged if possible.
655 If unset, yt-dlp's new sanitization rules are in effect
59ae15a5 656 """
5c3895ff 657 if s == '':
658 return ''
659
59ae15a5 660 def replace_insane(char):
c587cbb7
AT
661 if restricted and char in ACCENT_CHARS:
662 return ACCENT_CHARS[char]
91dd88b9 663 elif not restricted and char == '\n':
5c3895ff 664 return '\0 '
91dd88b9 665 elif char == '?' or ord(char) < 32 or ord(char) == 127:
59ae15a5
PH
666 return ''
667 elif char == '"':
668 return '' if restricted else '\''
669 elif char == ':':
5c3895ff 670 return '\0_\0-' if restricted else '\0 \0-'
59ae15a5 671 elif char in '\\/|*<>':
5c3895ff 672 return '\0_'
673 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
674 return '\0_'
59ae15a5
PH
675 return char
676
5c3895ff 677 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
28e614de 678 result = ''.join(map(replace_insane, s))
5c3895ff 679 if is_id is NO_DEFAULT:
680 result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result) # Remove repeated substitute chars
681 STRIP_RE = '(?:\0.|[ _-])*'
682 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
683 result = result.replace('\0', '') or '_'
684
796173d0
PH
685 if not is_id:
686 while '__' in result:
687 result = result.replace('__', '_')
688 result = result.strip('_')
689 # Common case of "Foreign band name - English song title"
690 if restricted and result.startswith('-_'):
691 result = result[2:]
5a42414b
PH
692 if result.startswith('-'):
693 result = '_' + result[len('-'):]
a7440261 694 result = result.lstrip('.')
796173d0
PH
695 if not result:
696 result = '_'
59ae15a5 697 return result
d77c3dfd 698
5f6a1245 699
c2934512 700def sanitize_path(s, force=False):
a2aaf4db 701 """Sanitizes and normalizes path on Windows"""
c2934512 702 if sys.platform == 'win32':
c4218ac3 703 force = False
c2934512 704 drive_or_unc, _ = os.path.splitdrive(s)
c2934512 705 elif force:
706 drive_or_unc = ''
707 else:
a2aaf4db 708 return s
c2934512 709
be531ef1
S
710 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
711 if drive_or_unc:
a2aaf4db
S
712 norm_path.pop(0)
713 sanitized_path = [
ec85ded8 714 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 715 for path_part in norm_path]
be531ef1
S
716 if drive_or_unc:
717 sanitized_path.insert(0, drive_or_unc + os.path.sep)
4abea8ca 718 elif force and s and s[0] == os.path.sep:
c4218ac3 719 sanitized_path.insert(0, os.path.sep)
a2aaf4db
S
720 return os.path.join(*sanitized_path)
721
722
17bcc626 723def sanitize_url(url):
befa4708
S
724 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
725 # the number of unwanted failures due to missing protocol
21633673 726 if url is None:
727 return
728 elif url.startswith('//'):
befa4708
S
729 return 'http:%s' % url
730 # Fix some common typos seen so far
731 COMMON_TYPOS = (
067aa17e 732 # https://github.com/ytdl-org/youtube-dl/issues/15649
befa4708
S
733 (r'^httpss://', r'https://'),
734 # https://bx1.be/lives/direct-tv/
735 (r'^rmtp([es]?)://', r'rtmp\1://'),
736 )
737 for mistake, fixup in COMMON_TYPOS:
738 if re.match(mistake, url):
739 return re.sub(mistake, fixup, url)
bc6b9bcd 740 return url
17bcc626
S
741
742
5435dcf9
HH
743def extract_basic_auth(url):
744 parts = compat_urlparse.urlsplit(url)
745 if parts.username is None:
746 return url, None
747 url = compat_urlparse.urlunsplit(parts._replace(netloc=(
748 parts.hostname if parts.port is None
749 else '%s:%d' % (parts.hostname, parts.port))))
750 auth_payload = base64.b64encode(
0f06bcd7 751 ('%s:%s' % (parts.username, parts.password or '')).encode())
752 return url, f'Basic {auth_payload.decode()}'
5435dcf9
HH
753
754
67dda517 755def sanitized_Request(url, *args, **kwargs):
bc6b9bcd 756 url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
5435dcf9
HH
757 if auth_header is not None:
758 headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
759 headers['Authorization'] = auth_header
ac668111 760 return urllib.request.Request(url, *args, **kwargs)
67dda517
S
761
762
51098426
S
763def expand_path(s):
764 """Expand shell variables and ~"""
765 return os.path.expandvars(compat_expanduser(s))
766
767
7e9a6125 768def orderedSet(iterable, *, lazy=False):
769 """Remove all duplicates from the input iterable"""
770 def _iter():
771 seen = [] # Do not use set since the items can be unhashable
772 for x in iterable:
773 if x not in seen:
774 seen.append(x)
775 yield x
776
777 return _iter() if lazy else list(_iter())
d77c3dfd 778
912b38b4 779
55b2f099 780def _htmlentity_transform(entity_with_semicolon):
4e408e47 781 """Transforms an HTML entity to a character."""
55b2f099
YCH
782 entity = entity_with_semicolon[:-1]
783
4e408e47 784 # Known non-numeric HTML entity
ac668111 785 if entity in html.entities.name2codepoint:
786 return chr(html.entities.name2codepoint[entity])
4e408e47 787
55b2f099
YCH
788 # TODO: HTML5 allows entities without a semicolon. For example,
789 # '&Eacuteric' should be decoded as 'Éric'.
ac668111 790 if entity_with_semicolon in html.entities.html5:
791 return html.entities.html5[entity_with_semicolon]
55b2f099 792
91757b0f 793 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
794 if mobj is not None:
795 numstr = mobj.group(1)
28e614de 796 if numstr.startswith('x'):
4e408e47 797 base = 16
28e614de 798 numstr = '0%s' % numstr
4e408e47
PH
799 else:
800 base = 10
067aa17e 801 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 802 with contextlib.suppress(ValueError):
ac668111 803 return chr(int(numstr, base))
4e408e47
PH
804
805 # Unknown entity in name, return its literal representation
7a3f0c00 806 return '&%s;' % entity
4e408e47
PH
807
808
d77c3dfd 809def unescapeHTML(s):
912b38b4
PH
810 if s is None:
811 return None
19a03940 812 assert isinstance(s, str)
d77c3dfd 813
4e408e47 814 return re.sub(
95f3f7c2 815 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 816
8bf48f23 817
cdb19aa4 818def escapeHTML(text):
819 return (
820 text
821 .replace('&', '&amp;')
822 .replace('<', '&lt;')
823 .replace('>', '&gt;')
824 .replace('"', '&quot;')
825 .replace("'", '&#39;')
826 )
827
828
f5b1bca9 829def process_communicate_or_kill(p, *args, **kwargs):
8a82af35 830 write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
831 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
832 return Popen.communicate_or_kill(p, *args, **kwargs)
f5b1bca9 833
834
d3c93ec2 835class Popen(subprocess.Popen):
836 if sys.platform == 'win32':
837 _startupinfo = subprocess.STARTUPINFO()
838 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
839 else:
840 _startupinfo = None
841
f0c9fb96 842 def __init__(self, *args, text=False, **kwargs):
843 if text is True:
844 kwargs['universal_newlines'] = True # For 3.6 compatibility
845 kwargs.setdefault('encoding', 'utf-8')
846 kwargs.setdefault('errors', 'replace')
86e5f3ed 847 super().__init__(*args, **kwargs, startupinfo=self._startupinfo)
d3c93ec2 848
849 def communicate_or_kill(self, *args, **kwargs):
8a82af35 850 try:
851 return self.communicate(*args, **kwargs)
852 except BaseException: # Including KeyboardInterrupt
f0c9fb96 853 self.kill(timeout=None)
8a82af35 854 raise
d3c93ec2 855
f0c9fb96 856 def kill(self, *, timeout=0):
857 super().kill()
858 if timeout != 0:
859 self.wait(timeout=timeout)
860
861 @classmethod
862 def run(cls, *args, **kwargs):
863 with cls(*args, **kwargs) as proc:
864 stdout, stderr = proc.communicate_or_kill()
865 return stdout or '', stderr or '', proc.returncode
866
d3c93ec2 867
aa49acd1
S
868def get_subprocess_encoding():
869 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
870 # For subprocess calls, encode with locale encoding
871 # Refer to http://stackoverflow.com/a/9951851/35070
872 encoding = preferredencoding()
873 else:
874 encoding = sys.getfilesystemencoding()
875 if encoding is None:
876 encoding = 'utf-8'
877 return encoding
878
879
8bf48f23 880def encodeFilename(s, for_subprocess=False):
19a03940 881 assert isinstance(s, str)
cfb0511d 882 return s
aa49acd1
S
883
884
885def decodeFilename(b, for_subprocess=False):
cfb0511d 886 return b
8bf48f23 887
f07b74fc
PH
888
889def encodeArgument(s):
cfb0511d 890 # Legacy code that uses byte strings
891 # Uncomment the following line after fixing all post processors
892 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
893 return s if isinstance(s, str) else s.decode('ascii')
f07b74fc
PH
894
895
aa49acd1 896def decodeArgument(b):
cfb0511d 897 return b
aa49acd1
S
898
899
8271226a
PH
900def decodeOption(optval):
901 if optval is None:
902 return optval
903 if isinstance(optval, bytes):
904 optval = optval.decode(preferredencoding())
905
906 assert isinstance(optval, compat_str)
907 return optval
1c256f70 908
5f6a1245 909
aa7785f8 910_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
911
912
913def timetuple_from_msec(msec):
914 secs, msec = divmod(msec, 1000)
915 mins, secs = divmod(secs, 60)
916 hrs, mins = divmod(mins, 60)
917 return _timetuple(hrs, mins, secs, msec)
918
919
cdb19aa4 920def formatSeconds(secs, delim=':', msec=False):
aa7785f8 921 time = timetuple_from_msec(secs * 1000)
922 if time.hours:
923 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
924 elif time.minutes:
925 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
4539dd30 926 else:
aa7785f8 927 ret = '%d' % time.seconds
928 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
4539dd30 929
a0ddb8a2 930
77562778 931def _ssl_load_windows_store_certs(ssl_context, storename):
932 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
933 try:
934 certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
935 if encoding == 'x509_asn' and (
936 trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
937 except PermissionError:
938 return
939 for cert in certs:
19a03940 940 with contextlib.suppress(ssl.SSLError):
77562778 941 ssl_context.load_verify_locations(cadata=cert)
a2366922 942
77562778 943
944def make_HTTPS_handler(params, **kwargs):
945 opts_check_certificate = not params.get('nocheckcertificate')
946 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
947 context.check_hostname = opts_check_certificate
f81c62a6 948 if params.get('legacyserverconnect'):
949 context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
4f28b537 950 # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
951 context.set_ciphers('DEFAULT')
8a82af35 952
77562778 953 context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
954 if opts_check_certificate:
d5820461 955 if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
956 context.load_verify_locations(cafile=certifi.where())
8a82af35 957 try:
958 context.load_default_certs()
959 # Work around the issue in load_default_certs when there are bad certificates. See:
960 # https://github.com/yt-dlp/yt-dlp/issues/1060,
961 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
962 except ssl.SSLError:
963 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
964 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
965 for storename in ('CA', 'ROOT'):
966 _ssl_load_windows_store_certs(context, storename)
967 context.set_default_verify_paths()
968
bb58c9ed 969 client_certfile = params.get('client_certificate')
970 if client_certfile:
971 try:
972 context.load_cert_chain(
973 client_certfile, keyfile=params.get('client_certificate_key'),
974 password=params.get('client_certificate_password'))
975 except ssl.SSLError:
976 raise YoutubeDLError('Unable to load client certificate')
2c6dcb65 977
978 # Some servers may reject requests if ALPN extension is not sent. See:
979 # https://github.com/python/cpython/issues/85140
980 # https://github.com/yt-dlp/yt-dlp/issues/3878
981 with contextlib.suppress(NotImplementedError):
982 context.set_alpn_protocols(['http/1.1'])
983
77562778 984 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 985
732ea2f0 986
5873d4cc 987def bug_reports_message(before=';'):
57e0f077 988 from .update import REPOSITORY
989
990 msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
991 'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')
5873d4cc
F
992
993 before = before.rstrip()
994 if not before or before.endswith(('.', '!', '?')):
995 msg = msg[0].title() + msg[1:]
996
997 return (before + ' ' if before else '') + msg
08f2a92c
JMF
998
999
bf5b9d85
PM
1000class YoutubeDLError(Exception):
1001 """Base exception for YoutubeDL errors."""
aa9369a2 1002 msg = None
1003
1004 def __init__(self, msg=None):
1005 if msg is not None:
1006 self.msg = msg
1007 elif self.msg is None:
1008 self.msg = type(self).__name__
1009 super().__init__(self.msg)
bf5b9d85
PM
1010
1011
ac668111 1012network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
3158150c 1013if hasattr(ssl, 'CertificateError'):
1014 network_exceptions.append(ssl.CertificateError)
1015network_exceptions = tuple(network_exceptions)
1016
1017
bf5b9d85 1018class ExtractorError(YoutubeDLError):
1c256f70 1019 """Error during info extraction."""
5f6a1245 1020
1151c407 1021 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
9a82b238 1022 """ tb, if given, is the original traceback (so that it can be printed out).
7a5c1cfe 1023 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
9a82b238 1024 """
3158150c 1025 if sys.exc_info()[0] in network_exceptions:
9a82b238 1026 expected = True
d5979c5d 1027
7265a219 1028 self.orig_msg = str(msg)
1c256f70 1029 self.traceback = tb
1151c407 1030 self.expected = expected
2eabb802 1031 self.cause = cause
d11271dd 1032 self.video_id = video_id
1151c407 1033 self.ie = ie
1034 self.exc_info = sys.exc_info() # preserve original exception
5df14442 1035 if isinstance(self.exc_info[1], ExtractorError):
1036 self.exc_info = self.exc_info[1].exc_info
1151c407 1037
86e5f3ed 1038 super().__init__(''.join((
a70635b8 1039 format_field(ie, None, '[%s] '),
1040 format_field(video_id, None, '%s: '),
7265a219 1041 msg,
a70635b8 1042 format_field(cause, None, ' (caused by %r)'),
1151c407 1043 '' if expected else bug_reports_message())))
1c256f70 1044
01951dda 1045 def format_traceback(self):
497d2fab 1046 return join_nonempty(
1047 self.traceback and ''.join(traceback.format_tb(self.traceback)),
e491d06d 1048 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
497d2fab 1049 delim='\n') or None
01951dda 1050
1c256f70 1051
416c7fcb
PH
1052class UnsupportedError(ExtractorError):
1053 def __init__(self, url):
86e5f3ed 1054 super().__init__(
416c7fcb
PH
1055 'Unsupported URL: %s' % url, expected=True)
1056 self.url = url
1057
1058
55b3e45b
JMF
1059class RegexNotFoundError(ExtractorError):
1060 """Error when a regex didn't match"""
1061 pass
1062
1063
773f291d
S
1064class GeoRestrictedError(ExtractorError):
1065 """Geographic restriction Error exception.
1066
1067 This exception may be thrown when a video is not available from your
1068 geographic location due to geographic restrictions imposed by a website.
1069 """
b6e0c7d2 1070
0db3bae8 1071 def __init__(self, msg, countries=None, **kwargs):
1072 kwargs['expected'] = True
86e5f3ed 1073 super().__init__(msg, **kwargs)
773f291d
S
1074 self.countries = countries
1075
1076
bf5b9d85 1077class DownloadError(YoutubeDLError):
59ae15a5 1078 """Download Error exception.
d77c3dfd 1079
59ae15a5
PH
1080 This exception may be thrown by FileDownloader objects if they are not
1081 configured to continue on errors. They will contain the appropriate
1082 error message.
1083 """
5f6a1245 1084
8cc83b8d
FV
1085 def __init__(self, msg, exc_info=None):
1086 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
86e5f3ed 1087 super().__init__(msg)
8cc83b8d 1088 self.exc_info = exc_info
d77c3dfd
FV
1089
1090
498f5606 1091class EntryNotInPlaylist(YoutubeDLError):
1092 """Entry not in playlist exception.
1093
1094 This exception will be thrown by YoutubeDL when a requested entry
1095 is not found in the playlist info_dict
1096 """
aa9369a2 1097 msg = 'Entry not found in info'
498f5606 1098
1099
bf5b9d85 1100class SameFileError(YoutubeDLError):
59ae15a5 1101 """Same File exception.
d77c3dfd 1102
59ae15a5
PH
1103 This exception will be thrown by FileDownloader objects if they detect
1104 multiple files would have to be downloaded to the same file on disk.
1105 """
aa9369a2 1106 msg = 'Fixed output name but more than one file to download'
1107
1108 def __init__(self, filename=None):
1109 if filename is not None:
1110 self.msg += f': {filename}'
1111 super().__init__(self.msg)
d77c3dfd
FV
1112
1113
bf5b9d85 1114class PostProcessingError(YoutubeDLError):
59ae15a5 1115 """Post Processing exception.
d77c3dfd 1116
59ae15a5
PH
1117 This exception may be raised by PostProcessor's .run() method to
1118 indicate an error in the postprocessing task.
1119 """
5f6a1245 1120
5f6a1245 1121
48f79687 1122class DownloadCancelled(YoutubeDLError):
1123 """ Exception raised when the download queue should be interrupted """
1124 msg = 'The download was cancelled'
8b0d7497 1125
8b0d7497 1126
48f79687 1127class ExistingVideoReached(DownloadCancelled):
1128 """ --break-on-existing triggered """
1129 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1130
48f79687 1131
1132class RejectedVideoReached(DownloadCancelled):
1133 """ --break-on-reject triggered """
1134 msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1135
1136
48f79687 1137class MaxDownloadsReached(DownloadCancelled):
59ae15a5 1138 """ --max-downloads limit has been reached. """
48f79687 1139 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1140
1141
f2ebc5c7 1142class ReExtractInfo(YoutubeDLError):
1143 """ Video info needs to be re-extracted. """
1144
1145 def __init__(self, msg, expected=False):
1146 super().__init__(msg)
1147 self.expected = expected
1148
1149
1150class ThrottledDownload(ReExtractInfo):
48f79687 1151 """ Download speed below --throttled-rate. """
aa9369a2 1152 msg = 'The download speed is below throttle limit'
d77c3dfd 1153
43b22906 1154 def __init__(self):
1155 super().__init__(self.msg, expected=False)
f2ebc5c7 1156
d77c3dfd 1157
bf5b9d85 1158class UnavailableVideoError(YoutubeDLError):
59ae15a5 1159 """Unavailable Format exception.
d77c3dfd 1160
59ae15a5
PH
1161 This exception will be thrown when a video is requested
1162 in a format that is not available for that video.
1163 """
aa9369a2 1164 msg = 'Unable to download video'
1165
1166 def __init__(self, err=None):
1167 if err is not None:
1168 self.msg += f': {err}'
1169 super().__init__(self.msg)
d77c3dfd
FV
1170
1171
bf5b9d85 1172class ContentTooShortError(YoutubeDLError):
59ae15a5 1173 """Content Too Short exception.
d77c3dfd 1174
59ae15a5
PH
1175 This exception may be raised by FileDownloader objects when a file they
1176 download is too small for what the server announced first, indicating
1177 the connection was probably interrupted.
1178 """
d77c3dfd 1179
59ae15a5 1180 def __init__(self, downloaded, expected):
86e5f3ed 1181 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
2c7ed247 1182 # Both in bytes
59ae15a5
PH
1183 self.downloaded = downloaded
1184 self.expected = expected
d77c3dfd 1185
5f6a1245 1186
bf5b9d85 1187class XAttrMetadataError(YoutubeDLError):
efa97bdc 1188 def __init__(self, code=None, msg='Unknown error'):
86e5f3ed 1189 super().__init__(msg)
efa97bdc 1190 self.code = code
bd264412 1191 self.msg = msg
efa97bdc
YCH
1192
1193 # Parsing code and msg
3089bc74 1194 if (self.code in (errno.ENOSPC, errno.EDQUOT)
a0566bbf 1195 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
efa97bdc
YCH
1196 self.reason = 'NO_SPACE'
1197 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1198 self.reason = 'VALUE_TOO_LONG'
1199 else:
1200 self.reason = 'NOT_SUPPORTED'
1201
1202
bf5b9d85 1203class XAttrUnavailableError(YoutubeDLError):
efa97bdc
YCH
1204 pass
1205
1206
c5a59d93 1207def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
f9934b96 1208 hc = http_class(*args, **kwargs)
be4a824d 1209 source_address = ydl_handler._params.get('source_address')
8959018a 1210
be4a824d 1211 if source_address is not None:
8959018a
AU
1212 # This is to workaround _create_connection() from socket where it will try all
1213 # address data from getaddrinfo() including IPv6. This filters the result from
1214 # getaddrinfo() based on the source_address value.
1215 # This is based on the cpython socket.create_connection() function.
1216 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1217 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1218 host, port = address
1219 err = None
1220 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
9e21e6d9
S
1221 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1222 ip_addrs = [addr for addr in addrs if addr[0] == af]
1223 if addrs and not ip_addrs:
1224 ip_version = 'v4' if af == socket.AF_INET else 'v6'
86e5f3ed 1225 raise OSError(
9e21e6d9
S
1226 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1227 % (ip_version, source_address[0]))
8959018a
AU
1228 for res in ip_addrs:
1229 af, socktype, proto, canonname, sa = res
1230 sock = None
1231 try:
1232 sock = socket.socket(af, socktype, proto)
1233 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1234 sock.settimeout(timeout)
1235 sock.bind(source_address)
1236 sock.connect(sa)
1237 err = None # Explicitly break reference cycle
1238 return sock
86e5f3ed 1239 except OSError as _:
8959018a
AU
1240 err = _
1241 if sock is not None:
1242 sock.close()
1243 if err is not None:
1244 raise err
1245 else:
86e5f3ed 1246 raise OSError('getaddrinfo returns an empty list')
9e21e6d9
S
1247 if hasattr(hc, '_create_connection'):
1248 hc._create_connection = _create_connection
cfb0511d 1249 hc.source_address = (source_address, 0)
be4a824d
PH
1250
1251 return hc
1252
1253
87f0e62d 1254def handle_youtubedl_headers(headers):
992fc9d6
YCH
1255 filtered_headers = headers
1256
1257 if 'Youtubedl-no-compression' in filtered_headers:
86e5f3ed 1258 filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
87f0e62d 1259 del filtered_headers['Youtubedl-no-compression']
87f0e62d 1260
992fc9d6 1261 return filtered_headers
87f0e62d
YCH
1262
1263
ac668111 1264class YoutubeDLHandler(urllib.request.HTTPHandler):
59ae15a5
PH
1265 """Handler for HTTP requests and responses.
1266
1267 This class, when installed with an OpenerDirector, automatically adds
1268 the standard headers to every HTTP request and handles gzipped and
1269 deflated responses from web servers. If compression is to be avoided in
1270 a particular request, the original request in the program code only has
0424ec30 1271 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
1272 removed before making the real request.
1273
1274 Part of this code was copied from:
1275
1276 http://techknack.net/python-urllib2-handlers/
1277
1278 Andrew Rowls, the author of that code, agreed to release it to the
1279 public domain.
1280 """
1281
be4a824d 1282 def __init__(self, params, *args, **kwargs):
ac668111 1283 urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
be4a824d
PH
1284 self._params = params
1285
1286 def http_open(self, req):
ac668111 1287 conn_class = http.client.HTTPConnection
71aff188
YCH
1288
1289 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1290 if socks_proxy:
1291 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1292 del req.headers['Ytdl-socks-proxy']
1293
be4a824d 1294 return self.do_open(functools.partial(
71aff188 1295 _create_http_connection, self, conn_class, False),
be4a824d
PH
1296 req)
1297
59ae15a5
PH
1298 @staticmethod
1299 def deflate(data):
fc2119f2 1300 if not data:
1301 return data
59ae15a5
PH
1302 try:
1303 return zlib.decompress(data, -zlib.MAX_WBITS)
1304 except zlib.error:
1305 return zlib.decompress(data)
1306
4390d5ec 1307 @staticmethod
1308 def brotli(data):
1309 if not data:
1310 return data
9b8ee23b 1311 return brotli.decompress(data)
4390d5ec 1312
acebc9cd 1313 def http_request(self, req):
51f267d9
S
1314 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1315 # always respected by websites, some tend to give out URLs with non percent-encoded
1316 # non-ASCII characters (see telemb.py, ard.py [#3412])
1317 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1318 # To work around aforementioned issue we will replace request's original URL with
1319 # percent-encoded one
1320 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1321 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1322 url = req.get_full_url()
1323 url_escaped = escape_url(url)
1324
1325 # Substitute URL if any change after escaping
1326 if url != url_escaped:
15d260eb 1327 req = update_Request(req, url=url_escaped)
51f267d9 1328
8b7539d2 1329 for h, v in self._params.get('http_headers', std_headers).items():
3d5f7a39
JK
1330 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1331 # The dict keys are capitalized because of this bug by urllib
1332 if h.capitalize() not in req.headers:
33ac271b 1333 req.add_header(h, v)
87f0e62d 1334
af14914b 1335 if 'Accept-encoding' not in req.headers:
1336 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1337
87f0e62d 1338 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b 1339
379a4f16 1340 return super().do_request_(req)
59ae15a5 1341
acebc9cd 1342 def http_response(self, req, resp):
59ae15a5
PH
1343 old_resp = resp
1344 # gzip
1345 if resp.headers.get('Content-encoding', '') == 'gzip':
aa3e9507
PH
1346 content = resp.read()
1347 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1348 try:
1349 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1350 except OSError as original_ioerror:
aa3e9507
PH
1351 # There may be junk add the end of the file
1352 # See http://stackoverflow.com/q/4928560/35070 for details
1353 for i in range(1, 1024):
1354 try:
1355 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1356 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1357 except OSError:
aa3e9507
PH
1358 continue
1359 break
1360 else:
1361 raise original_ioerror
ac668111 1362 resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1363 resp.msg = old_resp.msg
c047270c 1364 del resp.headers['Content-encoding']
59ae15a5
PH
1365 # deflate
1366 if resp.headers.get('Content-encoding', '') == 'deflate':
1367 gz = io.BytesIO(self.deflate(resp.read()))
ac668111 1368 resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1369 resp.msg = old_resp.msg
c047270c 1370 del resp.headers['Content-encoding']
4390d5ec 1371 # brotli
1372 if resp.headers.get('Content-encoding', '') == 'br':
ac668111 1373 resp = urllib.request.addinfourl(
4390d5ec 1374 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1375 resp.msg = old_resp.msg
1376 del resp.headers['Content-encoding']
ad729172 1377 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
067aa17e 1378 # https://github.com/ytdl-org/youtube-dl/issues/6457).
5a4d9ddb
S
1379 if 300 <= resp.code < 400:
1380 location = resp.headers.get('Location')
1381 if location:
1382 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
0f06bcd7 1383 location = location.encode('iso-8859-1').decode()
5a4d9ddb
S
1384 location_escaped = escape_url(location)
1385 if location != location_escaped:
1386 del resp.headers['Location']
1387 resp.headers['Location'] = location_escaped
59ae15a5 1388 return resp
0f8d03f8 1389
acebc9cd
PH
1390 https_request = http_request
1391 https_response = http_response
bf50b038 1392
5de90176 1393
71aff188
YCH
1394def make_socks_conn_class(base_class, socks_proxy):
1395 assert issubclass(base_class, (
ac668111 1396 http.client.HTTPConnection, http.client.HTTPSConnection))
71aff188
YCH
1397
1398 url_components = compat_urlparse.urlparse(socks_proxy)
1399 if url_components.scheme.lower() == 'socks5':
1400 socks_type = ProxyType.SOCKS5
1401 elif url_components.scheme.lower() in ('socks', 'socks4'):
1402 socks_type = ProxyType.SOCKS4
51fb4995
YCH
1403 elif url_components.scheme.lower() == 'socks4a':
1404 socks_type = ProxyType.SOCKS4A
71aff188 1405
cdd94c2e
YCH
1406 def unquote_if_non_empty(s):
1407 if not s:
1408 return s
ac668111 1409 return urllib.parse.unquote_plus(s)
cdd94c2e 1410
71aff188
YCH
1411 proxy_args = (
1412 socks_type,
1413 url_components.hostname, url_components.port or 1080,
1414 True, # Remote DNS
cdd94c2e
YCH
1415 unquote_if_non_empty(url_components.username),
1416 unquote_if_non_empty(url_components.password),
71aff188
YCH
1417 )
1418
1419 class SocksConnection(base_class):
1420 def connect(self):
1421 self.sock = sockssocket()
1422 self.sock.setproxy(*proxy_args)
19a03940 1423 if isinstance(self.timeout, (int, float)):
71aff188
YCH
1424 self.sock.settimeout(self.timeout)
1425 self.sock.connect((self.host, self.port))
1426
ac668111 1427 if isinstance(self, http.client.HTTPSConnection):
71aff188
YCH
1428 if hasattr(self, '_context'): # Python > 2.6
1429 self.sock = self._context.wrap_socket(
1430 self.sock, server_hostname=self.host)
1431 else:
1432 self.sock = ssl.wrap_socket(self.sock)
1433
1434 return SocksConnection
1435
1436
ac668111 1437class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
be4a824d 1438 def __init__(self, params, https_conn_class=None, *args, **kwargs):
ac668111 1439 urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
1440 self._https_conn_class = https_conn_class or http.client.HTTPSConnection
be4a824d
PH
1441 self._params = params
1442
1443 def https_open(self, req):
4f264c02 1444 kwargs = {}
71aff188
YCH
1445 conn_class = self._https_conn_class
1446
4f264c02
JMF
1447 if hasattr(self, '_context'): # python > 2.6
1448 kwargs['context'] = self._context
1449 if hasattr(self, '_check_hostname'): # python 3.x
1450 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1451
1452 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1453 if socks_proxy:
1454 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1455 del req.headers['Ytdl-socks-proxy']
1456
4f28b537 1457 try:
1458 return self.do_open(
1459 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1460 except urllib.error.URLError as e:
1461 if (isinstance(e.reason, ssl.SSLError)
1462 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1463 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1464 raise
be4a824d
PH
1465
1466
ac668111 1467class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
f1a8511f
S
1468 """
1469 See [1] for cookie file format.
1470
1471 1. https://curl.haxx.se/docs/http-cookies.html
1472 """
e7e62441 1473 _HTTPONLY_PREFIX = '#HttpOnly_'
c380cc28
S
1474 _ENTRY_LEN = 7
1475 _HEADER = '''# Netscape HTTP Cookie File
7a5c1cfe 1476# This file is generated by yt-dlp. Do not edit.
c380cc28
S
1477
1478'''
1479 _CookieFileEntry = collections.namedtuple(
1480 'CookieFileEntry',
1481 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
e7e62441 1482
d76fa1f3 1483 def __init__(self, filename=None, *args, **kwargs):
1484 super().__init__(None, *args, **kwargs)
1485 if self.is_path(filename):
1486 filename = os.fspath(filename)
1487 self.filename = filename
1488
24146491 1489 @staticmethod
1490 def _true_or_false(cndn):
1491 return 'TRUE' if cndn else 'FALSE'
1492
d76fa1f3 1493 @staticmethod
1494 def is_path(file):
1495 return isinstance(file, (str, bytes, os.PathLike))
1496
1497 @contextlib.contextmanager
1498 def open(self, file, *, write=False):
1499 if self.is_path(file):
1500 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1501 yield f
1502 else:
1503 if write:
1504 file.truncate(0)
1505 yield file
1506
24146491 1507 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1508 now = time.time()
1509 for cookie in self:
1510 if (not ignore_discard and cookie.discard
1511 or not ignore_expires and cookie.is_expired(now)):
1512 continue
1513 name, value = cookie.name, cookie.value
1514 if value is None:
1515 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1516 # with no name, whereas http.cookiejar regards it as a
1517 # cookie with no value.
1518 name, value = '', name
1519 f.write('%s\n' % '\t'.join((
1520 cookie.domain,
1521 self._true_or_false(cookie.domain.startswith('.')),
1522 cookie.path,
1523 self._true_or_false(cookie.secure),
1524 str_or_none(cookie.expires, default=''),
1525 name, value
1526 )))
1527
1528 def save(self, filename=None, *args, **kwargs):
c380cc28
S
1529 """
1530 Save cookies to a file.
24146491 1531 Code is taken from CPython 3.6
1532 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
c380cc28 1533
c380cc28
S
1534 if filename is None:
1535 if self.filename is not None:
1536 filename = self.filename
1537 else:
ac668111 1538 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
c380cc28 1539
24146491 1540 # Store session cookies with `expires` set to 0 instead of an empty string
1bab3437
S
1541 for cookie in self:
1542 if cookie.expires is None:
1543 cookie.expires = 0
c380cc28 1544
d76fa1f3 1545 with self.open(filename, write=True) as f:
c380cc28 1546 f.write(self._HEADER)
24146491 1547 self._really_save(f, *args, **kwargs)
1bab3437
S
1548
1549 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
e7e62441 1550 """Load cookies from a file."""
1551 if filename is None:
1552 if self.filename is not None:
1553 filename = self.filename
1554 else:
ac668111 1555 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
e7e62441 1556
c380cc28
S
1557 def prepare_line(line):
1558 if line.startswith(self._HTTPONLY_PREFIX):
1559 line = line[len(self._HTTPONLY_PREFIX):]
1560 # comments and empty lines are fine
1561 if line.startswith('#') or not line.strip():
1562 return line
1563 cookie_list = line.split('\t')
1564 if len(cookie_list) != self._ENTRY_LEN:
ac668111 1565 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
c380cc28
S
1566 cookie = self._CookieFileEntry(*cookie_list)
1567 if cookie.expires_at and not cookie.expires_at.isdigit():
ac668111 1568 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
c380cc28
S
1569 return line
1570
e7e62441 1571 cf = io.StringIO()
d76fa1f3 1572 with self.open(filename) as f:
e7e62441 1573 for line in f:
c380cc28
S
1574 try:
1575 cf.write(prepare_line(line))
ac668111 1576 except http.cookiejar.LoadError as e:
94aa0644 1577 if f'{line.strip()} '[0] in '[{"':
ac668111 1578 raise http.cookiejar.LoadError(
94aa0644
L
1579 'Cookies file must be Netscape formatted, not JSON. See '
1580 'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
19a03940 1581 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
c380cc28 1582 continue
e7e62441 1583 cf.seek(0)
1584 self._really_load(cf, filename, ignore_discard, ignore_expires)
1bab3437
S
1585 # Session cookies are denoted by either `expires` field set to
1586 # an empty string or 0. MozillaCookieJar only recognizes the former
1587 # (see [1]). So we need force the latter to be recognized as session
1588 # cookies on our own.
1589 # Session cookies may be important for cookies-based authentication,
1590 # e.g. usually, when user does not check 'Remember me' check box while
1591 # logging in on a site, some important cookies are stored as session
1592 # cookies so that not recognizing them will result in failed login.
1593 # 1. https://bugs.python.org/issue17164
1594 for cookie in self:
1595 # Treat `expires=0` cookies as session cookies
1596 if cookie.expires == 0:
1597 cookie.expires = None
1598 cookie.discard = True
1599
1600
ac668111 1601class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
a6420bf5 1602 def __init__(self, cookiejar=None):
ac668111 1603 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
a6420bf5
S
1604
1605 def http_response(self, request, response):
ac668111 1606 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
a6420bf5 1607
ac668111 1608 https_request = urllib.request.HTTPCookieProcessor.http_request
a6420bf5
S
1609 https_response = http_response
1610
1611
ac668111 1612class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
201c1459 1613 """YoutubeDL redirect handler
1614
1615 The code is based on HTTPRedirectHandler implementation from CPython [1].
1616
1617 This redirect handler solves two issues:
1618 - ensures redirect URL is always unicode under python 2
1619 - introduces support for experimental HTTP response status code
1620 308 Permanent Redirect [2] used by some sites [3]
1621
1622 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1623 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1624 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1625 """
1626
ac668111 1627 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
201c1459 1628
1629 def redirect_request(self, req, fp, code, msg, headers, newurl):
1630 """Return a Request or None in response to a redirect.
1631
1632 This is called by the http_error_30x methods when a
1633 redirection response is received. If a redirection should
1634 take place, return a new Request to allow http_error_30x to
1635 perform the redirect. Otherwise, raise HTTPError if no-one
1636 else should try to handle this url. Return None if you can't
1637 but another Handler might.
1638 """
1639 m = req.get_method()
1640 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1641 or code in (301, 302, 303) and m == "POST")):
1642 raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1643 # Strictly (according to RFC 2616), 301 or 302 in response to
1644 # a POST MUST NOT cause a redirection without confirmation
1645 # from the user (of urllib.request, in this case). In practice,
1646 # essentially all clients do redirect in this case, so we do
1647 # the same.
1648
201c1459 1649 # Be conciliant with URIs containing a space. This is mainly
1650 # redundant with the more complete encoding done in http_error_302(),
1651 # but it is kept for compatibility with other callers.
1652 newurl = newurl.replace(' ', '%20')
1653
1654 CONTENT_HEADERS = ("content-length", "content-type")
1655 # NB: don't use dict comprehension for python 2.6 compatibility
86e5f3ed 1656 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
afac4caa 1657
1658 # A 303 must either use GET or HEAD for subsequent request
1659 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1660 if code == 303 and m != 'HEAD':
1661 m = 'GET'
1662 # 301 and 302 redirects are commonly turned into a GET from a POST
1663 # for subsequent requests by browsers, so we'll do the same.
1664 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1665 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1666 if code in (301, 302) and m == 'POST':
1667 m = 'GET'
1668
ac668111 1669 return urllib.request.Request(
201c1459 1670 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
afac4caa 1671 unverifiable=True, method=m)
fca6dba8
S
1672
1673
46f59e89
S
1674def extract_timezone(date_str):
1675 m = re.search(
f137e4c2 1676 r'''(?x)
1677 ^.{8,}? # >=8 char non-TZ prefix, if present
1678 (?P<tz>Z| # just the UTC Z, or
1679 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1680 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1681 [ ]? # optional space
1682 (?P<sign>\+|-) # +/-
1683 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1684 $)
1685 ''', date_str)
46f59e89
S
1686 if not m:
1687 timezone = datetime.timedelta()
1688 else:
1689 date_str = date_str[:-len(m.group('tz'))]
1690 if not m.group('sign'):
1691 timezone = datetime.timedelta()
1692 else:
1693 sign = 1 if m.group('sign') == '+' else -1
1694 timezone = datetime.timedelta(
1695 hours=sign * int(m.group('hours')),
1696 minutes=sign * int(m.group('minutes')))
1697 return timezone, date_str
1698
1699
08b38d54 1700def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1701 """ Return a UNIX timestamp from the given date """
1702
1703 if date_str is None:
1704 return None
1705
52c3a6e4
S
1706 date_str = re.sub(r'\.[0-9]+', '', date_str)
1707
08b38d54 1708 if timezone is None:
46f59e89
S
1709 timezone, date_str = extract_timezone(date_str)
1710
19a03940 1711 with contextlib.suppress(ValueError):
86e5f3ed 1712 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1713 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1714 return calendar.timegm(dt.timetuple())
912b38b4
PH
1715
1716
46f59e89
S
1717def date_formats(day_first=True):
1718 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1719
1720
42bdd9d0 1721def unified_strdate(date_str, day_first=True):
bf50b038 1722 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1723
1724 if date_str is None:
1725 return None
bf50b038 1726 upload_date = None
5f6a1245 1727 # Replace commas
026fcc04 1728 date_str = date_str.replace(',', ' ')
42bdd9d0 1729 # Remove AM/PM + timezone
9bb8e0a3 1730 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1731 _, date_str = extract_timezone(date_str)
42bdd9d0 1732
46f59e89 1733 for expression in date_formats(day_first):
19a03940 1734 with contextlib.suppress(ValueError):
bf50b038 1735 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1736 if upload_date is None:
1737 timetuple = email.utils.parsedate_tz(date_str)
1738 if timetuple:
19a03940 1739 with contextlib.suppress(ValueError):
c6b9cf05 1740 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402
JMF
1741 if upload_date is not None:
1742 return compat_str(upload_date)
bf50b038 1743
5f6a1245 1744
46f59e89
S
1745def unified_timestamp(date_str, day_first=True):
1746 if date_str is None:
1747 return None
1748
2ae2ffda 1749 date_str = re.sub(r'[,|]', '', date_str)
46f59e89 1750
7dc2a74e 1751 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1752 timezone, date_str = extract_timezone(date_str)
1753
1754 # Remove AM/PM + timezone
1755 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1756
deef3195
S
1757 # Remove unrecognized timezones from ISO 8601 alike timestamps
1758 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1759 if m:
1760 date_str = date_str[:-len(m.group('tz'))]
1761
f226880c
PH
1762 # Python only supports microseconds, so remove nanoseconds
1763 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1764 if m:
1765 date_str = m.group(1)
1766
46f59e89 1767 for expression in date_formats(day_first):
19a03940 1768 with contextlib.suppress(ValueError):
7dc2a74e 1769 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1770 return calendar.timegm(dt.timetuple())
46f59e89
S
1771 timetuple = email.utils.parsedate_tz(date_str)
1772 if timetuple:
7dc2a74e 1773 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1774
1775
28e614de 1776def determine_ext(url, default_ext='unknown_video'):
85750f89 1777 if url is None or '.' not in url:
f4776371 1778 return default_ext
9cb9a5df 1779 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1780 if re.match(r'^[A-Za-z0-9]+$', guess):
1781 return guess
a7aaa398
S
1782 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1783 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1784 return guess.rstrip('/')
73e79f2a 1785 else:
cbdbb766 1786 return default_ext
73e79f2a 1787
5f6a1245 1788
824fa511
S
1789def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1790 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1791
5f6a1245 1792
9e62f283 1793def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1794 R"""
1795 Return a datetime object from a string.
1796 Supported format:
1797 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1798
1799 @param format strftime format of DATE
1800 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1801 auto: round to the unit provided in date_str (if applicable).
9e62f283 1802 """
1803 auto_precision = False
1804 if precision == 'auto':
1805 auto_precision = True
1806 precision = 'microsecond'
396a76f7 1807 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1808 if date_str in ('now', 'today'):
37254abc 1809 return today
f8795e10
PH
1810 if date_str == 'yesterday':
1811 return today - datetime.timedelta(days=1)
9e62f283 1812 match = re.match(
3d38b2d6 1813 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1814 date_str)
37254abc 1815 if match is not None:
9e62f283 1816 start_time = datetime_from_str(match.group('start'), precision, format)
1817 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1818 unit = match.group('unit')
9e62f283 1819 if unit == 'month' or unit == 'year':
1820 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1821 unit = 'day'
9e62f283 1822 else:
1823 if unit == 'week':
1824 unit = 'day'
1825 time *= 7
1826 delta = datetime.timedelta(**{unit + 's': time})
1827 new_date = start_time + delta
1828 if auto_precision:
1829 return datetime_round(new_date, unit)
1830 return new_date
1831
1832 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1833
1834
d49f8db3 1835def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1836 R"""
1837 Return a date object from a string using datetime_from_str
9e62f283 1838
3d38b2d6 1839 @param strict Restrict allowed patterns to "YYYYMMDD" and
1840 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1841 """
3d38b2d6 1842 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1843 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1844 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1845
1846
1847def datetime_add_months(dt, months):
1848 """Increment/Decrement a datetime object by months."""
1849 month = dt.month + months - 1
1850 year = dt.year + month // 12
1851 month = month % 12 + 1
1852 day = min(dt.day, calendar.monthrange(year, month)[1])
1853 return dt.replace(year, month, day)
1854
1855
1856def datetime_round(dt, precision='day'):
1857 """
1858 Round a datetime object's time to a specific precision
1859 """
1860 if precision == 'microsecond':
1861 return dt
1862
1863 unit_seconds = {
1864 'day': 86400,
1865 'hour': 3600,
1866 'minute': 60,
1867 'second': 1,
1868 }
1869 roundto = lambda x, n: ((x + n / 2) // n) * n
1870 timestamp = calendar.timegm(dt.timetuple())
1871 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1872
1873
e63fc1be 1874def hyphenate_date(date_str):
1875 """
1876 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1877 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1878 if match is not None:
1879 return '-'.join(match.groups())
1880 else:
1881 return date_str
1882
5f6a1245 1883
86e5f3ed 1884class DateRange:
bd558525 1885 """Represents a time interval between two dates"""
5f6a1245 1886
bd558525
JMF
1887 def __init__(self, start=None, end=None):
1888 """start and end must be strings in the format accepted by date"""
1889 if start is not None:
d49f8db3 1890 self.start = date_from_str(start, strict=True)
bd558525
JMF
1891 else:
1892 self.start = datetime.datetime.min.date()
1893 if end is not None:
d49f8db3 1894 self.end = date_from_str(end, strict=True)
bd558525
JMF
1895 else:
1896 self.end = datetime.datetime.max.date()
37254abc 1897 if self.start > self.end:
bd558525 1898 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1899
bd558525
JMF
1900 @classmethod
1901 def day(cls, day):
1902 """Returns a range that only contains the given day"""
5f6a1245
JW
1903 return cls(day, day)
1904
bd558525
JMF
1905 def __contains__(self, date):
1906 """Check if the date is in the range"""
37254abc
JMF
1907 if not isinstance(date, datetime.date):
1908 date = date_from_str(date)
1909 return self.start <= date <= self.end
5f6a1245 1910
bd558525 1911 def __str__(self):
86e5f3ed 1912 return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96
PH
1913
1914
1915def platform_name():
1916 """ Returns the platform name as a compat_str """
1917 res = platform.platform()
1918 if isinstance(res, bytes):
1919 res = res.decode(preferredencoding())
1920
1921 assert isinstance(res, compat_str)
1922 return res
c257baff
PH
1923
1924
0b9c08b4 1925@functools.cache
49fa4d9a 1926def get_windows_version():
8a82af35 1927 ''' Get Windows version. returns () if it's not running on Windows '''
49fa4d9a
N
1928 if compat_os_name == 'nt':
1929 return version_tuple(platform.win32_ver()[1])
1930 else:
8a82af35 1931 return ()
49fa4d9a
N
1932
1933
734f90bb 1934def write_string(s, out=None, encoding=None):
19a03940 1935 assert isinstance(s, str)
1936 out = out or sys.stderr
7459e3a2 1937
fe1daad3 1938 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 1939 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 1940
8a82af35 1941 enc, buffer = None, out
cfb0511d 1942 if 'b' in getattr(out, 'mode', ''):
c487cf00 1943 enc = encoding or preferredencoding()
104aa738 1944 elif hasattr(out, 'buffer'):
8a82af35 1945 buffer = out.buffer
104aa738 1946 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 1947
8a82af35 1948 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
1949 out.flush()
1950
1951
48ea9cea
PH
1952def bytes_to_intlist(bs):
1953 if not bs:
1954 return []
1955 if isinstance(bs[0], int): # Python 3
1956 return list(bs)
1957 else:
1958 return [ord(c) for c in bs]
1959
c257baff 1960
cba892fa 1961def intlist_to_bytes(xs):
1962 if not xs:
1963 return b''
ac668111 1964 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
1965
1966
8a82af35 1967class LockingUnsupportedError(OSError):
1890fc63 1968 msg = 'File locking is not supported'
0edb3e33 1969
1970 def __init__(self):
1971 super().__init__(self.msg)
1972
1973
c1c9a79c
PH
1974# Cross-platform file locking
1975if sys.platform == 'win32':
1976 import ctypes.wintypes
1977 import msvcrt
1978
1979 class OVERLAPPED(ctypes.Structure):
1980 _fields_ = [
1981 ('Internal', ctypes.wintypes.LPVOID),
1982 ('InternalHigh', ctypes.wintypes.LPVOID),
1983 ('Offset', ctypes.wintypes.DWORD),
1984 ('OffsetHigh', ctypes.wintypes.DWORD),
1985 ('hEvent', ctypes.wintypes.HANDLE),
1986 ]
1987
1988 kernel32 = ctypes.windll.kernel32
1989 LockFileEx = kernel32.LockFileEx
1990 LockFileEx.argtypes = [
1991 ctypes.wintypes.HANDLE, # hFile
1992 ctypes.wintypes.DWORD, # dwFlags
1993 ctypes.wintypes.DWORD, # dwReserved
1994 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1995 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1996 ctypes.POINTER(OVERLAPPED) # Overlapped
1997 ]
1998 LockFileEx.restype = ctypes.wintypes.BOOL
1999 UnlockFileEx = kernel32.UnlockFileEx
2000 UnlockFileEx.argtypes = [
2001 ctypes.wintypes.HANDLE, # hFile
2002 ctypes.wintypes.DWORD, # dwReserved
2003 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2004 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2005 ctypes.POINTER(OVERLAPPED) # Overlapped
2006 ]
2007 UnlockFileEx.restype = ctypes.wintypes.BOOL
2008 whole_low = 0xffffffff
2009 whole_high = 0x7fffffff
2010
747c0bd1 2011 def _lock_file(f, exclusive, block):
c1c9a79c
PH
2012 overlapped = OVERLAPPED()
2013 overlapped.Offset = 0
2014 overlapped.OffsetHigh = 0
2015 overlapped.hEvent = 0
2016 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 2017
2018 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2019 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2020 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 2021 # NB: No argument form of "ctypes.FormatError" does not work on PyPy
2022 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
2023
2024 def _unlock_file(f):
2025 assert f._lock_file_overlapped_p
2026 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 2027 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
2028 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2029
2030else:
399a76e6
YCH
2031 try:
2032 import fcntl
c1c9a79c 2033
a3125791 2034 def _lock_file(f, exclusive, block):
b63837bc 2035 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2036 if not block:
2037 flags |= fcntl.LOCK_NB
acea8d7c 2038 try:
b63837bc 2039 fcntl.flock(f, flags)
acea8d7c
JK
2040 except BlockingIOError:
2041 raise
2042 except OSError: # AOSP does not have flock()
b63837bc 2043 fcntl.lockf(f, flags)
c1c9a79c 2044
399a76e6 2045 def _unlock_file(f):
acea8d7c
JK
2046 try:
2047 fcntl.flock(f, fcntl.LOCK_UN)
2048 except OSError:
2049 fcntl.lockf(f, fcntl.LOCK_UN)
a3125791 2050
399a76e6 2051 except ImportError:
399a76e6 2052
a3125791 2053 def _lock_file(f, exclusive, block):
0edb3e33 2054 raise LockingUnsupportedError()
399a76e6
YCH
2055
2056 def _unlock_file(f):
0edb3e33 2057 raise LockingUnsupportedError()
c1c9a79c
PH
2058
2059
86e5f3ed 2060class locked_file:
0edb3e33 2061 locked = False
747c0bd1 2062
a3125791 2063 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2064 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2065 raise NotImplementedError(mode)
2066 self.mode, self.block = mode, block
2067
2068 writable = any(f in mode for f in 'wax+')
2069 readable = any(f in mode for f in 'r+')
2070 flags = functools.reduce(operator.ior, (
2071 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2072 getattr(os, 'O_BINARY', 0), # Windows only
2073 getattr(os, 'O_NOINHERIT', 0), # Windows only
2074 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2075 os.O_APPEND if 'a' in mode else 0,
2076 os.O_EXCL if 'x' in mode else 0,
2077 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2078 ))
2079
98804d03 2080 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2081
2082 def __enter__(self):
a3125791 2083 exclusive = 'r' not in self.mode
c1c9a79c 2084 try:
a3125791 2085 _lock_file(self.f, exclusive, self.block)
0edb3e33 2086 self.locked = True
86e5f3ed 2087 except OSError:
c1c9a79c
PH
2088 self.f.close()
2089 raise
fcfa8853 2090 if 'w' in self.mode:
131e14dc
JK
2091 try:
2092 self.f.truncate()
2093 except OSError as e:
1890fc63 2094 if e.errno not in (
2095 errno.ESPIPE, # Illegal seek - expected for FIFO
2096 errno.EINVAL, # Invalid argument - expected for /dev/null
2097 ):
2098 raise
c1c9a79c
PH
2099 return self
2100
0edb3e33 2101 def unlock(self):
2102 if not self.locked:
2103 return
c1c9a79c 2104 try:
0edb3e33 2105 _unlock_file(self.f)
c1c9a79c 2106 finally:
0edb3e33 2107 self.locked = False
c1c9a79c 2108
0edb3e33 2109 def __exit__(self, *_):
2110 try:
2111 self.unlock()
2112 finally:
2113 self.f.close()
4eb7f1d1 2114
0edb3e33 2115 open = __enter__
2116 close = __exit__
a3125791 2117
0edb3e33 2118 def __getattr__(self, attr):
2119 return getattr(self.f, attr)
a3125791 2120
0edb3e33 2121 def __iter__(self):
2122 return iter(self.f)
a3125791 2123
4eb7f1d1 2124
0b9c08b4 2125@functools.cache
4644ac55
S
2126def get_filesystem_encoding():
2127 encoding = sys.getfilesystemencoding()
2128 return encoding if encoding is not None else 'utf-8'
2129
2130
4eb7f1d1 2131def shell_quote(args):
a6a173c2 2132 quoted_args = []
4644ac55 2133 encoding = get_filesystem_encoding()
a6a173c2
JMF
2134 for a in args:
2135 if isinstance(a, bytes):
2136 # We may get a filename encoded with 'encodeFilename'
2137 a = a.decode(encoding)
aefce8e6 2138 quoted_args.append(compat_shlex_quote(a))
28e614de 2139 return ' '.join(quoted_args)
9d4660ca
PH
2140
2141
2142def smuggle_url(url, data):
2143 """ Pass additional data in a URL for internal use. """
2144
81953d1a
RA
2145 url, idata = unsmuggle_url(url, {})
2146 data.update(idata)
15707c7e 2147 sdata = compat_urllib_parse_urlencode(
28e614de
PH
2148 {'__youtubedl_smuggle': json.dumps(data)})
2149 return url + '#' + sdata
9d4660ca
PH
2150
2151
79f82953 2152def unsmuggle_url(smug_url, default=None):
83e865a3 2153 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2154 return smug_url, default
28e614de
PH
2155 url, _, sdata = smug_url.rpartition('#')
2156 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2157 data = json.loads(jsond)
2158 return url, data
02dbf93f
PH
2159
2160
e0fd9573 2161def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2162 """ Formats numbers with decimal sufixes like K, M, etc """
2163 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2164 if num is None or num < 0:
e0fd9573 2165 return None
eeb2a770 2166 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2167 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2168 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2169 if factor == 1024:
2170 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2171 converted = num / (factor ** exponent)
abbeeebc 2172 return fmt % (converted, suffix)
e0fd9573 2173
2174
02dbf93f 2175def format_bytes(bytes):
f02d24d8 2176 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2177
1c088fa8 2178
fb47597b
S
2179def lookup_unit_table(unit_table, s):
2180 units_re = '|'.join(re.escape(u) for u in unit_table)
2181 m = re.match(
782b1b5b 2182 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
2183 if not m:
2184 return None
2185 num_str = m.group('num').replace(',', '.')
2186 mult = unit_table[m.group('unit')]
2187 return int(float(num_str) * mult)
2188
2189
be64b5b0
PH
2190def parse_filesize(s):
2191 if s is None:
2192 return None
2193
dfb1b146 2194 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2195 # but we support those too
2196 _UNIT_TABLE = {
2197 'B': 1,
2198 'b': 1,
70852b47 2199 'bytes': 1,
be64b5b0
PH
2200 'KiB': 1024,
2201 'KB': 1000,
2202 'kB': 1024,
2203 'Kb': 1000,
13585d76 2204 'kb': 1000,
70852b47
YCH
2205 'kilobytes': 1000,
2206 'kibibytes': 1024,
be64b5b0
PH
2207 'MiB': 1024 ** 2,
2208 'MB': 1000 ** 2,
2209 'mB': 1024 ** 2,
2210 'Mb': 1000 ** 2,
13585d76 2211 'mb': 1000 ** 2,
70852b47
YCH
2212 'megabytes': 1000 ** 2,
2213 'mebibytes': 1024 ** 2,
be64b5b0
PH
2214 'GiB': 1024 ** 3,
2215 'GB': 1000 ** 3,
2216 'gB': 1024 ** 3,
2217 'Gb': 1000 ** 3,
13585d76 2218 'gb': 1000 ** 3,
70852b47
YCH
2219 'gigabytes': 1000 ** 3,
2220 'gibibytes': 1024 ** 3,
be64b5b0
PH
2221 'TiB': 1024 ** 4,
2222 'TB': 1000 ** 4,
2223 'tB': 1024 ** 4,
2224 'Tb': 1000 ** 4,
13585d76 2225 'tb': 1000 ** 4,
70852b47
YCH
2226 'terabytes': 1000 ** 4,
2227 'tebibytes': 1024 ** 4,
be64b5b0
PH
2228 'PiB': 1024 ** 5,
2229 'PB': 1000 ** 5,
2230 'pB': 1024 ** 5,
2231 'Pb': 1000 ** 5,
13585d76 2232 'pb': 1000 ** 5,
70852b47
YCH
2233 'petabytes': 1000 ** 5,
2234 'pebibytes': 1024 ** 5,
be64b5b0
PH
2235 'EiB': 1024 ** 6,
2236 'EB': 1000 ** 6,
2237 'eB': 1024 ** 6,
2238 'Eb': 1000 ** 6,
13585d76 2239 'eb': 1000 ** 6,
70852b47
YCH
2240 'exabytes': 1000 ** 6,
2241 'exbibytes': 1024 ** 6,
be64b5b0
PH
2242 'ZiB': 1024 ** 7,
2243 'ZB': 1000 ** 7,
2244 'zB': 1024 ** 7,
2245 'Zb': 1000 ** 7,
13585d76 2246 'zb': 1000 ** 7,
70852b47
YCH
2247 'zettabytes': 1000 ** 7,
2248 'zebibytes': 1024 ** 7,
be64b5b0
PH
2249 'YiB': 1024 ** 8,
2250 'YB': 1000 ** 8,
2251 'yB': 1024 ** 8,
2252 'Yb': 1000 ** 8,
13585d76 2253 'yb': 1000 ** 8,
70852b47
YCH
2254 'yottabytes': 1000 ** 8,
2255 'yobibytes': 1024 ** 8,
be64b5b0
PH
2256 }
2257
fb47597b
S
2258 return lookup_unit_table(_UNIT_TABLE, s)
2259
2260
2261def parse_count(s):
2262 if s is None:
be64b5b0
PH
2263 return None
2264
352d5da8 2265 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2266
2267 if re.match(r'^[\d,.]+$', s):
2268 return str_to_int(s)
2269
2270 _UNIT_TABLE = {
2271 'k': 1000,
2272 'K': 1000,
2273 'm': 1000 ** 2,
2274 'M': 1000 ** 2,
2275 'kk': 1000 ** 2,
2276 'KK': 1000 ** 2,
352d5da8 2277 'b': 1000 ** 3,
2278 'B': 1000 ** 3,
fb47597b 2279 }
be64b5b0 2280
352d5da8 2281 ret = lookup_unit_table(_UNIT_TABLE, s)
2282 if ret is not None:
2283 return ret
2284
2285 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2286 if mobj:
2287 return str_to_int(mobj.group(1))
be64b5b0 2288
2f7ae819 2289
5d45484c 2290def parse_resolution(s, *, lenient=False):
b871d7e9
S
2291 if s is None:
2292 return {}
2293
5d45484c
LNO
2294 if lenient:
2295 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2296 else:
2297 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2298 if mobj:
2299 return {
2300 'width': int(mobj.group('w')),
2301 'height': int(mobj.group('h')),
2302 }
2303
17ec8bcf 2304 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2305 if mobj:
2306 return {'height': int(mobj.group(1))}
2307
2308 mobj = re.search(r'\b([48])[kK]\b', s)
2309 if mobj:
2310 return {'height': int(mobj.group(1)) * 540}
2311
2312 return {}
2313
2314
0dc41787
S
2315def parse_bitrate(s):
2316 if not isinstance(s, compat_str):
2317 return
2318 mobj = re.search(r'\b(\d+)\s*kbps', s)
2319 if mobj:
2320 return int(mobj.group(1))
2321
2322
a942d6cb 2323def month_by_name(name, lang='en'):
caefb1de
PH
2324 """ Return the number of a month by (locale-independently) English name """
2325
f6717dec 2326 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2327
caefb1de 2328 try:
f6717dec 2329 return month_names.index(name) + 1
7105440c
YCH
2330 except ValueError:
2331 return None
2332
2333
2334def month_by_abbreviation(abbrev):
2335 """ Return the number of a month by (locale-independently) English
2336 abbreviations """
2337
2338 try:
2339 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2340 except ValueError:
2341 return None
18258362
JMF
2342
2343
5aafe895 2344def fix_xml_ampersands(xml_str):
18258362 2345 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2346 return re.sub(
2347 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2348 '&amp;',
5aafe895 2349 xml_str)
e3946f98
PH
2350
2351
2352def setproctitle(title):
8bf48f23 2353 assert isinstance(title, compat_str)
c1c05c67
YCH
2354
2355 # ctypes in Jython is not complete
2356 # http://bugs.jython.org/issue2148
2357 if sys.platform.startswith('java'):
2358 return
2359
e3946f98 2360 try:
611c1dd9 2361 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2362 except OSError:
2363 return
2f49bcd6
RC
2364 except TypeError:
2365 # LoadLibrary in Windows Python 2.7.13 only expects
2366 # a bytestring, but since unicode_literals turns
2367 # every string into a unicode string, it fails.
2368 return
0f06bcd7 2369 title_bytes = title.encode()
6eefe533
PH
2370 buf = ctypes.create_string_buffer(len(title_bytes))
2371 buf.value = title_bytes
e3946f98 2372 try:
6eefe533 2373 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2374 except AttributeError:
2375 return # Strange libc, just skip this
d7dda168
PH
2376
2377
2378def remove_start(s, start):
46bc9b7d 2379 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2380
2381
2b9faf55 2382def remove_end(s, end):
46bc9b7d 2383 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2384
2385
31b2051e
S
2386def remove_quotes(s):
2387 if s is None or len(s) < 2:
2388 return s
2389 for quote in ('"', "'", ):
2390 if s[0] == quote and s[-1] == quote:
2391 return s[1:-1]
2392 return s
2393
2394
b6e0c7d2
U
2395def get_domain(url):
2396 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2397 return domain.group('domain') if domain else None
2398
2399
29eb5174 2400def url_basename(url):
9b8aaeed 2401 path = compat_urlparse.urlparse(url).path
28e614de 2402 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2403
2404
02dc0a36
S
2405def base_url(url):
2406 return re.match(r'https?://[^?#&]+/', url).group()
2407
2408
e34c3361 2409def urljoin(base, path):
4b5de77b 2410 if isinstance(path, bytes):
0f06bcd7 2411 path = path.decode()
e34c3361
S
2412 if not isinstance(path, compat_str) or not path:
2413 return None
fad4ceb5 2414 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2415 return path
4b5de77b 2416 if isinstance(base, bytes):
0f06bcd7 2417 base = base.decode()
4b5de77b
S
2418 if not isinstance(base, compat_str) or not re.match(
2419 r'^(?:https?:)?//', base):
e34c3361
S
2420 return None
2421 return compat_urlparse.urljoin(base, path)
2422
2423
ac668111 2424class HEADRequest(urllib.request.Request):
aa94a6d3 2425 def get_method(self):
611c1dd9 2426 return 'HEAD'
7217e148
PH
2427
2428
ac668111 2429class PUTRequest(urllib.request.Request):
95cf60e8
S
2430 def get_method(self):
2431 return 'PUT'
2432
2433
9732d77e 2434def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2435 if get_attr and v is not None:
2436 v = getattr(v, get_attr, None)
1812afb7
S
2437 try:
2438 return int(v) * invscale // scale
31c49255 2439 except (ValueError, TypeError, OverflowError):
af98f8ff 2440 return default
9732d77e 2441
9572013d 2442
40a90862
JMF
2443def str_or_none(v, default=None):
2444 return default if v is None else compat_str(v)
2445
9732d77e
PH
2446
2447def str_to_int(int_str):
48d4681e 2448 """ A more relaxed version of int_or_none """
f9934b96 2449 if isinstance(int_str, int):
348c6bf1 2450 return int_str
42db58ec
S
2451 elif isinstance(int_str, compat_str):
2452 int_str = re.sub(r'[,\.\+]', '', int_str)
2453 return int_or_none(int_str)
608d11f5
PH
2454
2455
9732d77e 2456def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2457 if v is None:
2458 return default
2459 try:
2460 return float(v) * invscale / scale
5e1271c5 2461 except (ValueError, TypeError):
caf80631 2462 return default
43f775e4
PH
2463
2464
c7e327c4
S
2465def bool_or_none(v, default=None):
2466 return v if isinstance(v, bool) else default
2467
2468
53cd37ba
S
2469def strip_or_none(v, default=None):
2470 return v.strip() if isinstance(v, compat_str) else default
b72b4431
S
2471
2472
af03000a
S
2473def url_or_none(url):
2474 if not url or not isinstance(url, compat_str):
2475 return None
2476 url = url.strip()
29f7c58a 2477 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2478
2479
3e9b66d7 2480def request_to_url(req):
ac668111 2481 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2482 return req.get_full_url()
2483 else:
2484 return req
2485
2486
e29663c6 2487def strftime_or_none(timestamp, date_format, default=None):
2488 datetime_object = None
2489 try:
f9934b96 2490 if isinstance(timestamp, (int, float)): # unix timestamp
e29663c6 2491 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2492 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2493 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2494 return datetime_object.strftime(date_format)
2495 except (ValueError, TypeError, AttributeError):
2496 return default
2497
2498
608d11f5 2499def parse_duration(s):
f9934b96 2500 if not isinstance(s, str):
608d11f5 2501 return None
ca7b3246 2502 s = s.strip()
38d79fd1 2503 if not s:
2504 return None
ca7b3246 2505
acaff495 2506 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2507 m = re.match(r'''(?x)
2508 (?P<before_secs>
2509 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2510 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2511 (?P<ms>[.:][0-9]+)?Z?$
2512 ''', s)
acaff495 2513 if m:
8bd1c00b 2514 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2515 else:
2516 m = re.match(
056653bb
S
2517 r'''(?ix)(?:P?
2518 (?:
1c1b2f96 2519 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2520 )?
2521 (?:
1c1b2f96 2522 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2523 )?
2524 (?:
1c1b2f96 2525 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2526 )?
8f4b58d7 2527 (?:
1c1b2f96 2528 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2529 )?
056653bb 2530 T)?
acaff495 2531 (?:
1c1b2f96 2532 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2533 )?
2534 (?:
1c1b2f96 2535 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2536 )?
2537 (?:
2538 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2539 )?Z?$''', s)
acaff495 2540 if m:
2541 days, hours, mins, secs, ms = m.groups()
2542 else:
15846398 2543 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2544 if m:
2545 hours, mins = m.groups()
2546 else:
2547 return None
2548
acaff495 2549 if ms:
19a03940 2550 ms = ms.replace(':', '.')
2551 return sum(float(part or 0) * mult for part, mult in (
2552 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2553
2554
e65e4c88 2555def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2556 name, real_ext = os.path.splitext(filename)
e65e4c88 2557 return (
86e5f3ed 2558 f'{name}.{ext}{real_ext}'
e65e4c88 2559 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2560 else f'{filename}.{ext}')
d70ad093
PH
2561
2562
b3ed15b7
S
2563def replace_extension(filename, ext, expected_real_ext=None):
2564 name, real_ext = os.path.splitext(filename)
86e5f3ed 2565 return '{}.{}'.format(
b3ed15b7
S
2566 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2567 ext)
2568
2569
d70ad093
PH
2570def check_executable(exe, args=[]):
2571 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2572 args can be a list of arguments for a short output (like -version) """
2573 try:
f0c9fb96 2574 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2575 except OSError:
2576 return False
2577 return exe
b7ab0590
PH
2578
2579
8a7f68d0 2580def _get_exe_version_output(exe, args, *, to_screen=None):
2581 if to_screen:
2582 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
95807118 2583 try:
b64d04c1 2584 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2585 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2586 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
f0c9fb96 2587 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2588 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
95807118
PH
2589 except OSError:
2590 return False
f0c9fb96 2591 return stdout
cae97f65
PH
2592
2593
2594def detect_exe_version(output, version_re=None, unrecognized='present'):
2595 assert isinstance(output, compat_str)
2596 if version_re is None:
2597 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2598 m = re.search(version_re, output)
95807118
PH
2599 if m:
2600 return m.group(1)
2601 else:
2602 return unrecognized
2603
2604
9af98e17 2605def get_exe_version(exe, args=['--version'],
2606 version_re=None, unrecognized='present'):
2607 """ Returns the version of the specified executable,
2608 or False if the executable is not present """
2609 out = _get_exe_version_output(exe, args)
2610 return detect_exe_version(out, version_re, unrecognized) if out else False
2611
2612
7e88d7d7 2613def frange(start=0, stop=None, step=1):
2614 """Float range"""
2615 if stop is None:
2616 start, stop = 0, start
2617 sign = [-1, 1][step > 0] if step else 0
2618 while sign * start < sign * stop:
2619 yield start
2620 start += step
2621
2622
cb89cfc1 2623class LazyList(collections.abc.Sequence):
0f06bcd7 2624 """Lazy immutable list from an iterable
2625 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2626
8e5fecc8 2627 class IndexError(IndexError):
2628 pass
2629
282f5709 2630 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2631 self._iterable = iter(iterable)
2632 self._cache = [] if _cache is None else _cache
2633 self._reversed = reverse
483336e7 2634
2635 def __iter__(self):
0f06bcd7 2636 if self._reversed:
28419ca2 2637 # We need to consume the entire iterable to iterate in reverse
981052c9 2638 yield from self.exhaust()
28419ca2 2639 return
0f06bcd7 2640 yield from self._cache
2641 for item in self._iterable:
2642 self._cache.append(item)
483336e7 2643 yield item
2644
0f06bcd7 2645 def _exhaust(self):
2646 self._cache.extend(self._iterable)
2647 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2648 return self._cache
28419ca2 2649
981052c9 2650 def exhaust(self):
0f06bcd7 2651 """Evaluate the entire iterable"""
2652 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2653
28419ca2 2654 @staticmethod
0f06bcd7 2655 def _reverse_index(x):
e0f2b4b4 2656 return None if x is None else -(x + 1)
483336e7 2657
2658 def __getitem__(self, idx):
2659 if isinstance(idx, slice):
0f06bcd7 2660 if self._reversed:
2661 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2662 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2663 elif isinstance(idx, int):
0f06bcd7 2664 if self._reversed:
2665 idx = self._reverse_index(idx)
e0f2b4b4 2666 start, stop, step = idx, idx, 0
483336e7 2667 else:
2668 raise TypeError('indices must be integers or slices')
e0f2b4b4 2669 if ((start or 0) < 0 or (stop or 0) < 0
2670 or (start is None and step < 0)
2671 or (stop is None and step > 0)):
483336e7 2672 # We need to consume the entire iterable to be able to slice from the end
2673 # Obviously, never use this with infinite iterables
0f06bcd7 2674 self._exhaust()
8e5fecc8 2675 try:
0f06bcd7 2676 return self._cache[idx]
8e5fecc8 2677 except IndexError as e:
2678 raise self.IndexError(e) from e
0f06bcd7 2679 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2680 if n > 0:
0f06bcd7 2681 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2682 try:
0f06bcd7 2683 return self._cache[idx]
8e5fecc8 2684 except IndexError as e:
2685 raise self.IndexError(e) from e
483336e7 2686
2687 def __bool__(self):
2688 try:
0f06bcd7 2689 self[-1] if self._reversed else self[0]
8e5fecc8 2690 except self.IndexError:
483336e7 2691 return False
2692 return True
2693
2694 def __len__(self):
0f06bcd7 2695 self._exhaust()
2696 return len(self._cache)
483336e7 2697
282f5709 2698 def __reversed__(self):
0f06bcd7 2699 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2700
2701 def __copy__(self):
0f06bcd7 2702 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2703
28419ca2 2704 def __repr__(self):
2705 # repr and str should mimic a list. So we exhaust the iterable
2706 return repr(self.exhaust())
2707
2708 def __str__(self):
2709 return repr(self.exhaust())
2710
483336e7 2711
7be9ccff 2712class PagedList:
c07a39ae 2713
2714 class IndexError(IndexError):
2715 pass
2716
dd26ced1
PH
2717 def __len__(self):
2718 # This is only useful for tests
2719 return len(self.getslice())
2720
7be9ccff 2721 def __init__(self, pagefunc, pagesize, use_cache=True):
2722 self._pagefunc = pagefunc
2723 self._pagesize = pagesize
f1d13090 2724 self._pagecount = float('inf')
7be9ccff 2725 self._use_cache = use_cache
2726 self._cache = {}
2727
2728 def getpage(self, pagenum):
d8cf8d97 2729 page_results = self._cache.get(pagenum)
2730 if page_results is None:
f1d13090 2731 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2732 if self._use_cache:
2733 self._cache[pagenum] = page_results
2734 return page_results
2735
2736 def getslice(self, start=0, end=None):
2737 return list(self._getslice(start, end))
2738
2739 def _getslice(self, start, end):
55575225 2740 raise NotImplementedError('This method must be implemented by subclasses')
2741
2742 def __getitem__(self, idx):
f1d13090 2743 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2744 if not isinstance(idx, int) or idx < 0:
2745 raise TypeError('indices must be non-negative integers')
2746 entries = self.getslice(idx, idx + 1)
d8cf8d97 2747 if not entries:
c07a39ae 2748 raise self.IndexError()
d8cf8d97 2749 return entries[0]
55575225 2750
9c44d242
PH
2751
2752class OnDemandPagedList(PagedList):
a44ca5a4 2753 """Download pages until a page with less than maximum results"""
86e5f3ed 2754
7be9ccff 2755 def _getslice(self, start, end):
b7ab0590
PH
2756 for pagenum in itertools.count(start // self._pagesize):
2757 firstid = pagenum * self._pagesize
2758 nextfirstid = pagenum * self._pagesize + self._pagesize
2759 if start >= nextfirstid:
2760 continue
2761
b7ab0590
PH
2762 startv = (
2763 start % self._pagesize
2764 if firstid <= start < nextfirstid
2765 else 0)
b7ab0590
PH
2766 endv = (
2767 ((end - 1) % self._pagesize) + 1
2768 if (end is not None and firstid <= end <= nextfirstid)
2769 else None)
2770
f1d13090 2771 try:
2772 page_results = self.getpage(pagenum)
2773 except Exception:
2774 self._pagecount = pagenum - 1
2775 raise
b7ab0590
PH
2776 if startv != 0 or endv is not None:
2777 page_results = page_results[startv:endv]
7be9ccff 2778 yield from page_results
b7ab0590
PH
2779
2780 # A little optimization - if current page is not "full", ie. does
2781 # not contain page_size videos then we can assume that this page
2782 # is the last one - there are no more ids on further pages -
2783 # i.e. no need to query again.
2784 if len(page_results) + startv < self._pagesize:
2785 break
2786
2787 # If we got the whole page, but the next page is not interesting,
2788 # break out early as well
2789 if end == nextfirstid:
2790 break
81c2f20b
PH
2791
2792
9c44d242 2793class InAdvancePagedList(PagedList):
a44ca5a4 2794 """PagedList with total number of pages known in advance"""
86e5f3ed 2795
9c44d242 2796 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2797 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2798 self._pagecount = pagecount
9c44d242 2799
7be9ccff 2800 def _getslice(self, start, end):
9c44d242 2801 start_page = start // self._pagesize
d37707bd 2802 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2803 skip_elems = start - start_page * self._pagesize
2804 only_more = None if end is None else end - start
2805 for pagenum in range(start_page, end_page):
7be9ccff 2806 page_results = self.getpage(pagenum)
9c44d242 2807 if skip_elems:
7be9ccff 2808 page_results = page_results[skip_elems:]
9c44d242
PH
2809 skip_elems = None
2810 if only_more is not None:
7be9ccff 2811 if len(page_results) < only_more:
2812 only_more -= len(page_results)
9c44d242 2813 else:
7be9ccff 2814 yield from page_results[:only_more]
9c44d242 2815 break
7be9ccff 2816 yield from page_results
9c44d242
PH
2817
2818
7e88d7d7 2819class PlaylistEntries:
2820 MissingEntry = object()
2821 is_exhausted = False
2822
2823 def __init__(self, ydl, info_dict):
7e9a6125 2824 self.ydl = ydl
2825
2826 # _entries must be assigned now since infodict can change during iteration
2827 entries = info_dict.get('entries')
2828 if entries is None:
2829 raise EntryNotInPlaylist('There are no entries')
2830 elif isinstance(entries, list):
2831 self.is_exhausted = True
2832
2833 requested_entries = info_dict.get('requested_entries')
2834 self.is_incomplete = bool(requested_entries)
2835 if self.is_incomplete:
2836 assert self.is_exhausted
2837 self._entries = [self.MissingEntry] * max(requested_entries)
2838 for i, entry in zip(requested_entries, entries):
2839 self._entries[i - 1] = entry
2840 elif isinstance(entries, (list, PagedList, LazyList)):
2841 self._entries = entries
2842 else:
2843 self._entries = LazyList(entries)
7e88d7d7 2844
2845 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2846 (?P<start>[+-]?\d+)?
2847 (?P<range>[:-]
2848 (?P<end>[+-]?\d+|inf(?:inite)?)?
2849 (?::(?P<step>[+-]?\d+))?
2850 )?''')
2851
2852 @classmethod
2853 def parse_playlist_items(cls, string):
2854 for segment in string.split(','):
2855 if not segment:
2856 raise ValueError('There is two or more consecutive commas')
2857 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2858 if not mobj:
2859 raise ValueError(f'{segment!r} is not a valid specification')
2860 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2861 if int_or_none(step) == 0:
2862 raise ValueError(f'Step in {segment!r} cannot be zero')
2863 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2864
2865 def get_requested_items(self):
2866 playlist_items = self.ydl.params.get('playlist_items')
2867 playlist_start = self.ydl.params.get('playliststart', 1)
2868 playlist_end = self.ydl.params.get('playlistend')
2869 # For backwards compatibility, interpret -1 as whole list
2870 if playlist_end in (-1, None):
2871 playlist_end = ''
2872 if not playlist_items:
2873 playlist_items = f'{playlist_start}:{playlist_end}'
2874 elif playlist_start != 1 or playlist_end:
2875 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2876
2877 for index in self.parse_playlist_items(playlist_items):
2878 for i, entry in self[index]:
2879 yield i, entry
1ac4fd80 2880 if not entry:
2881 continue
7e88d7d7 2882 try:
2883 # TODO: Add auto-generated fields
2884 self.ydl._match_entry(entry, incomplete=True, silent=True)
2885 except (ExistingVideoReached, RejectedVideoReached):
2886 return
2887
7e9a6125 2888 def get_full_count(self):
2889 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 2890 return len(self)
2891 elif isinstance(self._entries, InAdvancePagedList):
2892 if self._entries._pagesize == 1:
2893 return self._entries._pagecount
2894
7e88d7d7 2895 @functools.cached_property
2896 def _getter(self):
2897 if isinstance(self._entries, list):
2898 def get_entry(i):
2899 try:
2900 entry = self._entries[i]
2901 except IndexError:
2902 entry = self.MissingEntry
2903 if not self.is_incomplete:
2904 raise self.IndexError()
2905 if entry is self.MissingEntry:
2906 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2907 return entry
2908 else:
2909 def get_entry(i):
2910 try:
2911 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2912 except (LazyList.IndexError, PagedList.IndexError):
2913 raise self.IndexError()
2914 return get_entry
2915
2916 def __getitem__(self, idx):
2917 if isinstance(idx, int):
2918 idx = slice(idx, idx)
2919
2920 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
2921 step = 1 if idx.step is None else idx.step
2922 if idx.start is None:
2923 start = 0 if step > 0 else len(self) - 1
2924 else:
2925 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
2926
2927 # NB: Do not call len(self) when idx == [:]
2928 if idx.stop is None:
2929 stop = 0 if step < 0 else float('inf')
2930 else:
2931 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
2932 stop += [-1, 1][step > 0]
2933
2934 for i in frange(start, stop, step):
2935 if i < 0:
2936 continue
2937 try:
7e9a6125 2938 entry = self._getter(i)
2939 except self.IndexError:
2940 self.is_exhausted = True
2941 if step > 0:
7e88d7d7 2942 break
7e9a6125 2943 continue
7e88d7d7 2944 yield i + 1, entry
2945
2946 def __len__(self):
2947 return len(tuple(self[:]))
2948
2949 class IndexError(IndexError):
2950 pass
2951
2952
81c2f20b 2953def uppercase_escape(s):
676eb3f2 2954 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2955 return re.sub(
a612753d 2956 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2957 lambda m: unicode_escape(m.group(0))[0],
2958 s)
0fe2ff78
YCH
2959
2960
2961def lowercase_escape(s):
2962 unicode_escape = codecs.getdecoder('unicode_escape')
2963 return re.sub(
2964 r'\\u[0-9a-fA-F]{4}',
2965 lambda m: unicode_escape(m.group(0))[0],
2966 s)
b53466e1 2967
d05cfe06
S
2968
2969def escape_rfc3986(s):
2970 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 2971 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2972
2973
2974def escape_url(url):
2975 """Escape URL as suggested by RFC 3986"""
2976 url_parsed = compat_urllib_parse_urlparse(url)
2977 return url_parsed._replace(
efbed08d 2978 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2979 path=escape_rfc3986(url_parsed.path),
2980 params=escape_rfc3986(url_parsed.params),
2981 query=escape_rfc3986(url_parsed.query),
2982 fragment=escape_rfc3986(url_parsed.fragment)
2983 ).geturl()
2984
62e609ab 2985
4dfbf869 2986def parse_qs(url):
2987 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2988
2989
62e609ab
PH
2990def read_batch_urls(batch_fd):
2991 def fixup(url):
2992 if not isinstance(url, compat_str):
2993 url = url.decode('utf-8', 'replace')
8c04f0be 2994 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2995 for bom in BOM_UTF8:
2996 if url.startswith(bom):
2997 url = url[len(bom):]
2998 url = url.lstrip()
2999 if not url or url.startswith(('#', ';', ']')):
62e609ab 3000 return False
8c04f0be 3001 # "#" cannot be stripped out since it is part of the URI
3002 # However, it can be safely stipped out if follwing a whitespace
3003 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
3004
3005 with contextlib.closing(batch_fd) as fd:
3006 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
3007
3008
3009def urlencode_postdata(*args, **kargs):
15707c7e 3010 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
3011
3012
38f9ef31 3013def update_url_query(url, query):
cacd9966
YCH
3014 if not query:
3015 return url
38f9ef31 3016 parsed_url = compat_urlparse.urlparse(url)
3017 qs = compat_parse_qs(parsed_url.query)
3018 qs.update(query)
3019 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 3020 query=compat_urllib_parse_urlencode(qs, True)))
16392824 3021
8e60dc75 3022
ed0291d1
S
3023def update_Request(req, url=None, data=None, headers={}, query={}):
3024 req_headers = req.headers.copy()
3025 req_headers.update(headers)
3026 req_data = data or req.data
3027 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
3028 req_get_method = req.get_method()
3029 if req_get_method == 'HEAD':
3030 req_type = HEADRequest
3031 elif req_get_method == 'PUT':
3032 req_type = PUTRequest
3033 else:
ac668111 3034 req_type = urllib.request.Request
ed0291d1
S
3035 new_req = req_type(
3036 req_url, data=req_data, headers=req_headers,
3037 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3038 if hasattr(req, 'timeout'):
3039 new_req.timeout = req.timeout
3040 return new_req
3041
3042
10c87c15 3043def _multipart_encode_impl(data, boundary):
0c265486
YCH
3044 content_type = 'multipart/form-data; boundary=%s' % boundary
3045
3046 out = b''
3047 for k, v in data.items():
3048 out += b'--' + boundary.encode('ascii') + b'\r\n'
3049 if isinstance(k, compat_str):
0f06bcd7 3050 k = k.encode()
0c265486 3051 if isinstance(v, compat_str):
0f06bcd7 3052 v = v.encode()
0c265486
YCH
3053 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3054 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3055 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3056 if boundary.encode('ascii') in content:
3057 raise ValueError('Boundary overlaps with data')
3058 out += content
3059
3060 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3061
3062 return out, content_type
3063
3064
3065def multipart_encode(data, boundary=None):
3066 '''
3067 Encode a dict to RFC 7578-compliant form-data
3068
3069 data:
3070 A dict where keys and values can be either Unicode or bytes-like
3071 objects.
3072 boundary:
3073 If specified a Unicode object, it's used as the boundary. Otherwise
3074 a random boundary is generated.
3075
3076 Reference: https://tools.ietf.org/html/rfc7578
3077 '''
3078 has_specified_boundary = boundary is not None
3079
3080 while True:
3081 if boundary is None:
3082 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3083
3084 try:
10c87c15 3085 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
3086 break
3087 except ValueError:
3088 if has_specified_boundary:
3089 raise
3090 boundary = None
3091
3092 return out, content_type
3093
3094
86296ad2 3095def dict_get(d, key_or_keys, default=None, skip_false_values=True):
a44ca5a4 3096 for val in map(d.get, variadic(key_or_keys)):
3097 if val is not None and (val or not skip_false_values):
3098 return val
3099 return default
cbecc9b9
S
3100
3101
c4f60dd7 3102def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3103 for f in funcs:
a32a9a7e 3104 try:
c4f60dd7 3105 val = f(*args, **kwargs)
3106 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
a32a9a7e
S
3107 pass
3108 else:
c4f60dd7 3109 if expected_type is None or isinstance(val, expected_type):
3110 return val
3111
3112
3113def try_get(src, getter, expected_type=None):
3114 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
3115
3116
90137ca4 3117def filter_dict(dct, cndn=lambda _, v: v is not None):
3118 return {k: v for k, v in dct.items() if cndn(k, v)}
3119
3120
6cc62232
S
3121def merge_dicts(*dicts):
3122 merged = {}
3123 for a_dict in dicts:
3124 for k, v in a_dict.items():
90137ca4 3125 if (v is not None and k not in merged
3126 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
3127 merged[k] = v
3128 return merged
3129
3130
8e60dc75
S
3131def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3132 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3133
16392824 3134
a1a530b0
PH
3135US_RATINGS = {
3136 'G': 0,
3137 'PG': 10,
3138 'PG-13': 13,
3139 'R': 16,
3140 'NC': 18,
3141}
fac55558
PH
3142
3143
a8795327 3144TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3145 'TV-Y': 0,
3146 'TV-Y7': 7,
3147 'TV-G': 0,
3148 'TV-PG': 0,
3149 'TV-14': 14,
3150 'TV-MA': 17,
a8795327
S
3151}
3152
3153
146c80e2 3154def parse_age_limit(s):
19a03940 3155 # isinstance(False, int) is True. So type() must be used instead
c487cf00 3156 if type(s) is int: # noqa: E721
a8795327 3157 return s if 0 <= s <= 21 else None
19a03940 3158 elif not isinstance(s, str):
d838b1bd 3159 return None
146c80e2 3160 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3161 if m:
3162 return int(m.group('age'))
5c5fae6d 3163 s = s.upper()
a8795327
S
3164 if s in US_RATINGS:
3165 return US_RATINGS[s]
5a16c9d9 3166 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3167 if m:
5a16c9d9 3168 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3169 return None
146c80e2
S
3170
3171
fac55558 3172def strip_jsonp(code):
609a61e3 3173 return re.sub(
5552c9eb 3174 r'''(?sx)^
e9c671d5 3175 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3176 (?:\s*&&\s*(?P=func_name))?
3177 \s*\(\s*(?P<callback_data>.*)\);?
3178 \s*?(?://[^\n]*)*$''',
3179 r'\g<callback_data>', code)
478c2c61
PH
3180
3181
5c610515 3182def js_to_json(code, vars={}):
3183 # vars is a dict of var, val pairs to substitute
c843e685 3184 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3185 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3186 INTEGER_TABLE = (
86e5f3ed 3187 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3188 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
3189 )
3190
e05f6939 3191 def fix_kv(m):
e7b6d122
PH
3192 v = m.group(0)
3193 if v in ('true', 'false', 'null'):
3194 return v
421ddcb8
C
3195 elif v in ('undefined', 'void 0'):
3196 return 'null'
8bdd16b4 3197 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
bd1e4844 3198 return ""
3199
3200 if v[0] in ("'", '"'):
3201 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 3202 '"': '\\"',
bd1e4844 3203 "\\'": "'",
3204 '\\\n': '',
3205 '\\x': '\\u00',
3206 }.get(m.group(0), m.group(0)), v[1:-1])
8bdd16b4 3207 else:
3208 for regex, base in INTEGER_TABLE:
3209 im = re.match(regex, v)
3210 if im:
3211 i = int(im.group(1), base)
3212 return '"%d":' % i if v.endswith(':') else '%d' % i
89ac4a19 3213
5c610515 3214 if v in vars:
3215 return vars[v]
3216
e7b6d122 3217 return '"%s"' % v
e05f6939 3218
8072ef2b 3219 def create_map(mobj):
3220 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3221
febff4c1 3222 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
8072ef2b 3223 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
febff4c1 3224
bd1e4844 3225 return re.sub(r'''(?sx)
3226 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3227 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 3228 {comment}|,(?={skip}[\]}}])|
421ddcb8 3229 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
4195096e 3230 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
8bdd16b4 3231 [0-9]+(?={skip}:)|
3232 !+
4195096e 3233 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
3234
3235
478c2c61
PH
3236def qualities(quality_ids):
3237 """ Get a numeric quality value out of a list of possible values """
3238 def q(qid):
3239 try:
3240 return quality_ids.index(qid)
3241 except ValueError:
3242 return -1
3243 return q
3244
acd69589 3245
8aa0e7cd 3246POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3247
3248
de6000d9 3249DEFAULT_OUTTMPL = {
3250 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3251 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3252}
3253OUTTMPL_TYPES = {
72755351 3254 'chapter': None,
de6000d9 3255 'subtitle': None,
3256 'thumbnail': None,
3257 'description': 'description',
3258 'annotation': 'annotations.xml',
3259 'infojson': 'info.json',
08438d2c 3260 'link': None,
3b603dbd 3261 'pl_video': None,
5112f26a 3262 'pl_thumbnail': None,
de6000d9 3263 'pl_description': 'description',
3264 'pl_infojson': 'info.json',
3265}
0a871f68 3266
143db31d 3267# As of [1] format syntax is:
3268# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3269# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3270STR_FORMAT_RE_TMPL = r'''(?x)
3271 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3272 %
524e2e4f 3273 (?P<has_key>\((?P<key>{0})\))?
752cda38 3274 (?P<format>
524e2e4f 3275 (?P<conversion>[#0\-+ ]+)?
3276 (?P<min_width>\d+)?
3277 (?P<precision>\.\d+)?
3278 (?P<len_mod>[hlL])? # unused in python
901130bb 3279 {1} # conversion type
752cda38 3280 )
143db31d 3281'''
3282
7d1eb38a 3283
901130bb 3284STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3285
7d1eb38a 3286
a020a0dc
PH
3287def limit_length(s, length):
3288 """ Add ellipses to overly long strings """
3289 if s is None:
3290 return None
3291 ELLIPSES = '...'
3292 if len(s) > length:
3293 return s[:length - len(ELLIPSES)] + ELLIPSES
3294 return s
48844745
PH
3295
3296
3297def version_tuple(v):
5f9b8394 3298 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3299
3300
3301def is_outdated_version(version, limit, assume_new=True):
3302 if not version:
3303 return not assume_new
3304 try:
3305 return version_tuple(version) < version_tuple(limit)
3306 except ValueError:
3307 return not assume_new
732ea2f0
PH
3308
3309
3310def ytdl_is_updateable():
7a5c1cfe 3311 """ Returns if yt-dlp can be updated with -U """
735d865e 3312
5d535b4a 3313 from .update import is_non_updateable
732ea2f0 3314
5d535b4a 3315 return not is_non_updateable()
7d4111ed
PH
3316
3317
3318def args_to_str(args):
3319 # Get a short string representation for a subprocess command
702ccf2d 3320 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3321
3322
9b9c5355 3323def error_to_compat_str(err):
cfb0511d 3324 return str(err)
fdae2358
S
3325
3326
a44ca5a4 3327def error_to_str(err):
3328 return f'{type(err).__name__}: {err}'
3329
3330
c460bdd5 3331def mimetype2ext(mt):
eb9ee194
S
3332 if mt is None:
3333 return None
3334
9359f3d4
F
3335 mt, _, params = mt.partition(';')
3336 mt = mt.strip()
3337
3338 FULL_MAP = {
765ac263 3339 'audio/mp4': 'm4a',
6c33d24b
YCH
3340 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3341 # it's the most popular one
3342 'audio/mpeg': 'mp3',
ba39289d 3343 'audio/x-wav': 'wav',
9359f3d4
F
3344 'audio/wav': 'wav',
3345 'audio/wave': 'wav',
3346 }
3347
3348 ext = FULL_MAP.get(mt)
765ac263
JMF
3349 if ext is not None:
3350 return ext
3351
9359f3d4 3352 SUBTYPE_MAP = {
f6861ec9 3353 '3gpp': '3gp',
cafcf657 3354 'smptett+xml': 'tt',
cafcf657 3355 'ttaf+xml': 'dfxp',
a0d8d704 3356 'ttml+xml': 'ttml',
f6861ec9 3357 'x-flv': 'flv',
a0d8d704 3358 'x-mp4-fragmented': 'mp4',
d4f05d47 3359 'x-ms-sami': 'sami',
a0d8d704 3360 'x-ms-wmv': 'wmv',
b4173f15
RA
3361 'mpegurl': 'm3u8',
3362 'x-mpegurl': 'm3u8',
3363 'vnd.apple.mpegurl': 'm3u8',
3364 'dash+xml': 'mpd',
b4173f15 3365 'f4m+xml': 'f4m',
f164b971 3366 'hds+xml': 'f4m',
e910fe2f 3367 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 3368 'quicktime': 'mov',
98ce1a3f 3369 'mp2t': 'ts',
39e7107d 3370 'x-wav': 'wav',
9359f3d4
F
3371 'filmstrip+json': 'fs',
3372 'svg+xml': 'svg',
3373 }
3374
3375 _, _, subtype = mt.rpartition('/')
3376 ext = SUBTYPE_MAP.get(subtype.lower())
3377 if ext is not None:
3378 return ext
3379
3380 SUFFIX_MAP = {
3381 'json': 'json',
3382 'xml': 'xml',
3383 'zip': 'zip',
3384 'gzip': 'gz',
3385 }
3386
3387 _, _, suffix = subtype.partition('+')
3388 ext = SUFFIX_MAP.get(suffix)
3389 if ext is not None:
3390 return ext
3391
3392 return subtype.replace('+', '.')
c460bdd5
PH
3393
3394
2814f12b
THD
3395def ext2mimetype(ext_or_url):
3396 if not ext_or_url:
3397 return None
3398 if '.' not in ext_or_url:
3399 ext_or_url = f'file.{ext_or_url}'
3400 return mimetypes.guess_type(ext_or_url)[0]
3401
3402
4f3c5e06 3403def parse_codecs(codecs_str):
3404 # http://tools.ietf.org/html/rfc6381
3405 if not codecs_str:
3406 return {}
a0566bbf 3407 split_codecs = list(filter(None, map(
dbf5416a 3408 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3409 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3410 for full_codec in split_codecs:
9bd979ca 3411 parts = full_codec.split('.')
3412 codec = parts[0].replace('0', '')
3413 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3414 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
4f3c5e06 3415 if not vcodec:
b69fd25c 3416 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
176f1866 3417 if codec in ('dvh1', 'dvhe'):
3418 hdr = 'DV'
9bd979ca 3419 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3420 hdr = 'HDR10'
3421 elif full_codec.replace('0', '').startswith('vp9.2'):
176f1866 3422 hdr = 'HDR10'
b69fd25c 3423 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4f3c5e06 3424 if not acodec:
3425 acodec = full_codec
4afa3ec4 3426 elif codec in ('stpp', 'wvtt',):
3fe75fdc 3427 if not scodec:
3428 scodec = full_codec
4f3c5e06 3429 else:
19a03940 3430 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3431 if vcodec or acodec or scodec:
4f3c5e06 3432 return {
3433 'vcodec': vcodec or 'none',
3434 'acodec': acodec or 'none',
176f1866 3435 'dynamic_range': hdr,
3fe75fdc 3436 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3437 }
b69fd25c 3438 elif len(split_codecs) == 2:
3439 return {
3440 'vcodec': split_codecs[0],
3441 'acodec': split_codecs[1],
3442 }
4f3c5e06 3443 return {}
3444
3445
2ccd1b10 3446def urlhandle_detect_ext(url_handle):
79298173 3447 getheader = url_handle.headers.get
2ccd1b10 3448
b55ee18f
PH
3449 cd = getheader('Content-Disposition')
3450 if cd:
3451 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3452 if m:
3453 e = determine_ext(m.group('filename'), default_ext=None)
3454 if e:
3455 return e
3456
c460bdd5 3457 return mimetype2ext(getheader('Content-Type'))
05900629
PH
3458
3459
1e399778
YCH
3460def encode_data_uri(data, mime_type):
3461 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3462
3463
05900629 3464def age_restricted(content_limit, age_limit):
6ec6cb4e 3465 """ Returns True iff the content should be blocked """
05900629
PH
3466
3467 if age_limit is None: # No limit set
3468 return False
3469 if content_limit is None:
3470 return False # Content available for everyone
3471 return age_limit < content_limit
61ca9a80
PH
3472
3473
3474def is_html(first_bytes):
3475 """ Detect whether a file contains HTML by examining its first bytes. """
3476
3477 BOMS = [
3478 (b'\xef\xbb\xbf', 'utf-8'),
3479 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3480 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3481 (b'\xff\xfe', 'utf-16-le'),
3482 (b'\xfe\xff', 'utf-16-be'),
3483 ]
80e8493e 3484
3485 encoding = 'utf-8'
61ca9a80 3486 for bom, enc in BOMS:
80e8493e 3487 while first_bytes.startswith(bom):
3488 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3489
80e8493e 3490 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3491
3492
3493def determine_protocol(info_dict):
3494 protocol = info_dict.get('protocol')
3495 if protocol is not None:
3496 return protocol
3497
7de837a5 3498 url = sanitize_url(info_dict['url'])
a055469f
PH
3499 if url.startswith('rtmp'):
3500 return 'rtmp'
3501 elif url.startswith('mms'):
3502 return 'mms'
3503 elif url.startswith('rtsp'):
3504 return 'rtsp'
3505
3506 ext = determine_ext(url)
3507 if ext == 'm3u8':
3508 return 'm3u8'
3509 elif ext == 'f4m':
3510 return 'f4m'
3511
3512 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
3513
3514
c5e3f849 3515def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3516 """ Render a list of rows, each as a list of values.
3517 Text after a \t will be right aligned """
ec11a9f4 3518 def width(string):
c5e3f849 3519 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3520
3521 def get_max_lens(table):
ec11a9f4 3522 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3523
3524 def filter_using_list(row, filterArray):
d16df59d 3525 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3526
d16df59d 3527 max_lens = get_max_lens(data) if hide_empty else []
3528 header_row = filter_using_list(header_row, max_lens)
3529 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3530
cfb56d1a 3531 table = [header_row] + data
76d321f6 3532 max_lens = get_max_lens(table)
c5e3f849 3533 extra_gap += 1
76d321f6 3534 if delim:
c5e3f849 3535 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3536 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3537 for row in table:
3538 for pos, text in enumerate(map(str, row)):
c5e3f849 3539 if '\t' in text:
3540 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3541 else:
3542 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3543 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3544 return ret
347de493
PH
3545
3546
8f18aca8 3547def _match_one(filter_part, dct, incomplete):
77b87f05 3548 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3549 STRING_OPERATORS = {
3550 '*=': operator.contains,
3551 '^=': lambda attr, value: attr.startswith(value),
3552 '$=': lambda attr, value: attr.endswith(value),
3553 '~=': lambda attr, value: re.search(value, attr),
3554 }
347de493 3555 COMPARISON_OPERATORS = {
a047eeb6 3556 **STRING_OPERATORS,
3557 '<=': operator.le, # "<=" must be defined above "<"
347de493 3558 '<': operator.lt,
347de493 3559 '>=': operator.ge,
a047eeb6 3560 '>': operator.gt,
347de493 3561 '=': operator.eq,
347de493 3562 }
a047eeb6 3563
6db9c4d5 3564 if isinstance(incomplete, bool):
3565 is_incomplete = lambda _: incomplete
3566 else:
3567 is_incomplete = lambda k: k in incomplete
3568
64fa820c 3569 operator_rex = re.compile(r'''(?x)
347de493 3570 (?P<key>[a-z_]+)
77b87f05 3571 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3572 (?:
a047eeb6 3573 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3574 (?P<strval>.+?)
347de493 3575 )
347de493 3576 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3577 m = operator_rex.fullmatch(filter_part.strip())
347de493 3578 if m:
18f96d12 3579 m = m.groupdict()
3580 unnegated_op = COMPARISON_OPERATORS[m['op']]
3581 if m['negation']:
77b87f05
MT
3582 op = lambda attr, value: not unnegated_op(attr, value)
3583 else:
3584 op = unnegated_op
18f96d12 3585 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3586 if m['quote']:
3587 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3588 actual_value = dct.get(m['key'])
3589 numeric_comparison = None
f9934b96 3590 if isinstance(actual_value, (int, float)):
e5a088dc
S
3591 # If the original field is a string and matching comparisonvalue is
3592 # a number we should respect the origin of the original field
3593 # and process comparison value as a string (see
18f96d12 3594 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3595 try:
18f96d12 3596 numeric_comparison = int(comparison_value)
347de493 3597 except ValueError:
18f96d12 3598 numeric_comparison = parse_filesize(comparison_value)
3599 if numeric_comparison is None:
3600 numeric_comparison = parse_filesize(f'{comparison_value}B')
3601 if numeric_comparison is None:
3602 numeric_comparison = parse_duration(comparison_value)
3603 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3604 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3605 if actual_value is None:
6db9c4d5 3606 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3607 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3608
3609 UNARY_OPERATORS = {
1cc47c66
S
3610 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3611 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3612 }
64fa820c 3613 operator_rex = re.compile(r'''(?x)
347de493 3614 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3615 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3616 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3617 if m:
3618 op = UNARY_OPERATORS[m.group('op')]
3619 actual_value = dct.get(m.group('key'))
6db9c4d5 3620 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3621 return True
347de493
PH
3622 return op(actual_value)
3623
3624 raise ValueError('Invalid filter part %r' % filter_part)
3625
3626
8f18aca8 3627def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3628 """ Filter a dictionary with a simple string syntax.
3629 @returns Whether the filter passes
3630 @param incomplete Set of keys that is expected to be missing from dct.
3631 Can be True/False to indicate all/none of the keys may be missing.
3632 All conditions on incomplete keys pass if the key is missing
8f18aca8 3633 """
347de493 3634 return all(
8f18aca8 3635 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3636 for filter_part in re.split(r'(?<!\\)&', filter_str))
347de493
PH
3637
3638
b1a7cd05 3639def match_filter_func(filters):
3640 if not filters:
d1b5f70b 3641 return None
492272fe 3642 filters = set(variadic(filters))
d1b5f70b 3643
492272fe 3644 interactive = '-' in filters
3645 if interactive:
3646 filters.remove('-')
3647
3648 def _match_func(info_dict, incomplete=False):
3649 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3650 return NO_DEFAULT if interactive and not incomplete else None
347de493 3651 else:
b1a7cd05 3652 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3653 filter_str = ') | ('.join(map(str.strip, filters))
3654 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3655 return _match_func
91410c9b
PH
3656
3657
5ec1b6b7 3658def download_range_func(chapters, ranges):
3659 def inner(info_dict, ydl):
3660 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3661 else 'Cannot match chapters since chapter information is unavailable')
5ec1b6b7 3662 for regex in chapters or []:
3663 for i, chapter in enumerate(info_dict.get('chapters') or []):
3664 if re.search(regex, chapter['title']):
3665 warning = None
3666 yield {**chapter, 'index': i}
56ba69e4 3667 if chapters and warning:
5ec1b6b7 3668 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3669
3670 yield from ({'start_time': start, 'end_time': end} for start, end in ranges or [])
3671
3672 return inner
3673
3674
bf6427d2
YCH
3675def parse_dfxp_time_expr(time_expr):
3676 if not time_expr:
d631d5f9 3677 return
bf6427d2 3678
1d485a1a 3679 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3680 if mobj:
3681 return float(mobj.group('time_offset'))
3682
db2fe38b 3683 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3684 if mobj:
db2fe38b 3685 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3686
3687
c1c924ab 3688def srt_subtitles_timecode(seconds):
aa7785f8 3689 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3690
3691
3692def ass_subtitles_timecode(seconds):
3693 time = timetuple_from_msec(seconds * 1000)
3694 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3695
3696
3697def dfxp2srt(dfxp_data):
3869028f
YCH
3698 '''
3699 @param dfxp_data A bytes-like object containing DFXP data
3700 @returns A unicode object containing converted SRT data
3701 '''
5b995f71 3702 LEGACY_NAMESPACES = (
3869028f
YCH
3703 (b'http://www.w3.org/ns/ttml', [
3704 b'http://www.w3.org/2004/11/ttaf1',
3705 b'http://www.w3.org/2006/04/ttaf1',
3706 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3707 ]),
3869028f
YCH
3708 (b'http://www.w3.org/ns/ttml#styling', [
3709 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3710 ]),
3711 )
3712
3713 SUPPORTED_STYLING = [
3714 'color',
3715 'fontFamily',
3716 'fontSize',
3717 'fontStyle',
3718 'fontWeight',
3719 'textDecoration'
3720 ]
3721
4e335771 3722 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3723 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3724 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3725 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3726 })
bf6427d2 3727
5b995f71
RA
3728 styles = {}
3729 default_style = {}
3730
86e5f3ed 3731 class TTMLPElementParser:
5b995f71
RA
3732 _out = ''
3733 _unclosed_elements = []
3734 _applied_styles = []
bf6427d2 3735
2b14cb56 3736 def start(self, tag, attrib):
5b995f71
RA
3737 if tag in (_x('ttml:br'), 'br'):
3738 self._out += '\n'
3739 else:
3740 unclosed_elements = []
3741 style = {}
3742 element_style_id = attrib.get('style')
3743 if default_style:
3744 style.update(default_style)
3745 if element_style_id:
3746 style.update(styles.get(element_style_id, {}))
3747 for prop in SUPPORTED_STYLING:
3748 prop_val = attrib.get(_x('tts:' + prop))
3749 if prop_val:
3750 style[prop] = prop_val
3751 if style:
3752 font = ''
3753 for k, v in sorted(style.items()):
3754 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3755 continue
3756 if k == 'color':
3757 font += ' color="%s"' % v
3758 elif k == 'fontSize':
3759 font += ' size="%s"' % v
3760 elif k == 'fontFamily':
3761 font += ' face="%s"' % v
3762 elif k == 'fontWeight' and v == 'bold':
3763 self._out += '<b>'
3764 unclosed_elements.append('b')
3765 elif k == 'fontStyle' and v == 'italic':
3766 self._out += '<i>'
3767 unclosed_elements.append('i')
3768 elif k == 'textDecoration' and v == 'underline':
3769 self._out += '<u>'
3770 unclosed_elements.append('u')
3771 if font:
3772 self._out += '<font' + font + '>'
3773 unclosed_elements.append('font')
3774 applied_style = {}
3775 if self._applied_styles:
3776 applied_style.update(self._applied_styles[-1])
3777 applied_style.update(style)
3778 self._applied_styles.append(applied_style)
3779 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3780
2b14cb56 3781 def end(self, tag):
5b995f71
RA
3782 if tag not in (_x('ttml:br'), 'br'):
3783 unclosed_elements = self._unclosed_elements.pop()
3784 for element in reversed(unclosed_elements):
3785 self._out += '</%s>' % element
3786 if unclosed_elements and self._applied_styles:
3787 self._applied_styles.pop()
bf6427d2 3788
2b14cb56 3789 def data(self, data):
5b995f71 3790 self._out += data
2b14cb56 3791
3792 def close(self):
5b995f71 3793 return self._out.strip()
2b14cb56 3794
3795 def parse_node(node):
3796 target = TTMLPElementParser()
3797 parser = xml.etree.ElementTree.XMLParser(target=target)
3798 parser.feed(xml.etree.ElementTree.tostring(node))
3799 return parser.close()
bf6427d2 3800
5b995f71
RA
3801 for k, v in LEGACY_NAMESPACES:
3802 for ns in v:
3803 dfxp_data = dfxp_data.replace(ns, k)
3804
3869028f 3805 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3806 out = []
5b995f71 3807 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3808
3809 if not paras:
3810 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3811
5b995f71
RA
3812 repeat = False
3813 while True:
3814 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3815 style_id = style.get('id') or style.get(_x('xml:id'))
3816 if not style_id:
3817 continue
5b995f71
RA
3818 parent_style_id = style.get('style')
3819 if parent_style_id:
3820 if parent_style_id not in styles:
3821 repeat = True
3822 continue
3823 styles[style_id] = styles[parent_style_id].copy()
3824 for prop in SUPPORTED_STYLING:
3825 prop_val = style.get(_x('tts:' + prop))
3826 if prop_val:
3827 styles.setdefault(style_id, {})[prop] = prop_val
3828 if repeat:
3829 repeat = False
3830 else:
3831 break
3832
3833 for p in ('body', 'div'):
3834 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3835 if ele is None:
3836 continue
3837 style = styles.get(ele.get('style'))
3838 if not style:
3839 continue
3840 default_style.update(style)
3841
bf6427d2 3842 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3843 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3844 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3845 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3846 if begin_time is None:
3847 continue
7dff0363 3848 if not end_time:
d631d5f9
YCH
3849 if not dur:
3850 continue
3851 end_time = begin_time + dur
bf6427d2
YCH
3852 out.append('%d\n%s --> %s\n%s\n\n' % (
3853 index,
c1c924ab
YCH
3854 srt_subtitles_timecode(begin_time),
3855 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3856 parse_node(para)))
3857
3858 return ''.join(out)
3859
3860
c487cf00 3861def cli_option(params, command_option, param, separator=None):
66e289ba 3862 param = params.get(param)
c487cf00 3863 return ([] if param is None
3864 else [command_option, str(param)] if separator is None
3865 else [f'{command_option}{separator}{param}'])
66e289ba
S
3866
3867
3868def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3869 param = params.get(param)
c487cf00 3870 assert param in (True, False, None)
3871 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
3872
3873
3874def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 3875 return [command_option] if params.get(param) == expected_value else []
66e289ba
S
3876
3877
e92caff5 3878def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 3879 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 3880 if use_compat:
5b1ecbb3 3881 return argdict
3882 else:
3883 argdict = None
eab9b2bc 3884 if argdict is None:
5b1ecbb3 3885 return default
eab9b2bc 3886 assert isinstance(argdict, dict)
3887
e92caff5 3888 assert isinstance(keys, (list, tuple))
3889 for key_list in keys:
e92caff5 3890 arg_list = list(filter(
3891 lambda x: x is not None,
6606817a 3892 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 3893 if arg_list:
3894 return [arg for args in arg_list for arg in args]
3895 return default
66e289ba 3896
6251555f 3897
330690a2 3898def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3899 main_key, exe = main_key.lower(), exe.lower()
3900 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3901 keys = [f'{root_key}{k}' for k in (keys or [''])]
3902 if root_key in keys:
3903 if main_key != exe:
3904 keys.append((main_key, exe))
3905 keys.append('default')
3906 else:
3907 use_compat = False
3908 return cli_configuration_args(argdict, keys, default, use_compat)
3909
66e289ba 3910
86e5f3ed 3911class ISO639Utils:
39672624
YCH
3912 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3913 _lang_map = {
3914 'aa': 'aar',
3915 'ab': 'abk',
3916 'ae': 'ave',
3917 'af': 'afr',
3918 'ak': 'aka',
3919 'am': 'amh',
3920 'an': 'arg',
3921 'ar': 'ara',
3922 'as': 'asm',
3923 'av': 'ava',
3924 'ay': 'aym',
3925 'az': 'aze',
3926 'ba': 'bak',
3927 'be': 'bel',
3928 'bg': 'bul',
3929 'bh': 'bih',
3930 'bi': 'bis',
3931 'bm': 'bam',
3932 'bn': 'ben',
3933 'bo': 'bod',
3934 'br': 'bre',
3935 'bs': 'bos',
3936 'ca': 'cat',
3937 'ce': 'che',
3938 'ch': 'cha',
3939 'co': 'cos',
3940 'cr': 'cre',
3941 'cs': 'ces',
3942 'cu': 'chu',
3943 'cv': 'chv',
3944 'cy': 'cym',
3945 'da': 'dan',
3946 'de': 'deu',
3947 'dv': 'div',
3948 'dz': 'dzo',
3949 'ee': 'ewe',
3950 'el': 'ell',
3951 'en': 'eng',
3952 'eo': 'epo',
3953 'es': 'spa',
3954 'et': 'est',
3955 'eu': 'eus',
3956 'fa': 'fas',
3957 'ff': 'ful',
3958 'fi': 'fin',
3959 'fj': 'fij',
3960 'fo': 'fao',
3961 'fr': 'fra',
3962 'fy': 'fry',
3963 'ga': 'gle',
3964 'gd': 'gla',
3965 'gl': 'glg',
3966 'gn': 'grn',
3967 'gu': 'guj',
3968 'gv': 'glv',
3969 'ha': 'hau',
3970 'he': 'heb',
b7acc835 3971 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
3972 'hi': 'hin',
3973 'ho': 'hmo',
3974 'hr': 'hrv',
3975 'ht': 'hat',
3976 'hu': 'hun',
3977 'hy': 'hye',
3978 'hz': 'her',
3979 'ia': 'ina',
3980 'id': 'ind',
b7acc835 3981 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
3982 'ie': 'ile',
3983 'ig': 'ibo',
3984 'ii': 'iii',
3985 'ik': 'ipk',
3986 'io': 'ido',
3987 'is': 'isl',
3988 'it': 'ita',
3989 'iu': 'iku',
3990 'ja': 'jpn',
3991 'jv': 'jav',
3992 'ka': 'kat',
3993 'kg': 'kon',
3994 'ki': 'kik',
3995 'kj': 'kua',
3996 'kk': 'kaz',
3997 'kl': 'kal',
3998 'km': 'khm',
3999 'kn': 'kan',
4000 'ko': 'kor',
4001 'kr': 'kau',
4002 'ks': 'kas',
4003 'ku': 'kur',
4004 'kv': 'kom',
4005 'kw': 'cor',
4006 'ky': 'kir',
4007 'la': 'lat',
4008 'lb': 'ltz',
4009 'lg': 'lug',
4010 'li': 'lim',
4011 'ln': 'lin',
4012 'lo': 'lao',
4013 'lt': 'lit',
4014 'lu': 'lub',
4015 'lv': 'lav',
4016 'mg': 'mlg',
4017 'mh': 'mah',
4018 'mi': 'mri',
4019 'mk': 'mkd',
4020 'ml': 'mal',
4021 'mn': 'mon',
4022 'mr': 'mar',
4023 'ms': 'msa',
4024 'mt': 'mlt',
4025 'my': 'mya',
4026 'na': 'nau',
4027 'nb': 'nob',
4028 'nd': 'nde',
4029 'ne': 'nep',
4030 'ng': 'ndo',
4031 'nl': 'nld',
4032 'nn': 'nno',
4033 'no': 'nor',
4034 'nr': 'nbl',
4035 'nv': 'nav',
4036 'ny': 'nya',
4037 'oc': 'oci',
4038 'oj': 'oji',
4039 'om': 'orm',
4040 'or': 'ori',
4041 'os': 'oss',
4042 'pa': 'pan',
4043 'pi': 'pli',
4044 'pl': 'pol',
4045 'ps': 'pus',
4046 'pt': 'por',
4047 'qu': 'que',
4048 'rm': 'roh',
4049 'rn': 'run',
4050 'ro': 'ron',
4051 'ru': 'rus',
4052 'rw': 'kin',
4053 'sa': 'san',
4054 'sc': 'srd',
4055 'sd': 'snd',
4056 'se': 'sme',
4057 'sg': 'sag',
4058 'si': 'sin',
4059 'sk': 'slk',
4060 'sl': 'slv',
4061 'sm': 'smo',
4062 'sn': 'sna',
4063 'so': 'som',
4064 'sq': 'sqi',
4065 'sr': 'srp',
4066 'ss': 'ssw',
4067 'st': 'sot',
4068 'su': 'sun',
4069 'sv': 'swe',
4070 'sw': 'swa',
4071 'ta': 'tam',
4072 'te': 'tel',
4073 'tg': 'tgk',
4074 'th': 'tha',
4075 'ti': 'tir',
4076 'tk': 'tuk',
4077 'tl': 'tgl',
4078 'tn': 'tsn',
4079 'to': 'ton',
4080 'tr': 'tur',
4081 'ts': 'tso',
4082 'tt': 'tat',
4083 'tw': 'twi',
4084 'ty': 'tah',
4085 'ug': 'uig',
4086 'uk': 'ukr',
4087 'ur': 'urd',
4088 'uz': 'uzb',
4089 've': 'ven',
4090 'vi': 'vie',
4091 'vo': 'vol',
4092 'wa': 'wln',
4093 'wo': 'wol',
4094 'xh': 'xho',
4095 'yi': 'yid',
e9a50fba 4096 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4097 'yo': 'yor',
4098 'za': 'zha',
4099 'zh': 'zho',
4100 'zu': 'zul',
4101 }
4102
4103 @classmethod
4104 def short2long(cls, code):
4105 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4106 return cls._lang_map.get(code[:2])
4107
4108 @classmethod
4109 def long2short(cls, code):
4110 """Convert language code from ISO 639-2/T to ISO 639-1"""
4111 for short_name, long_name in cls._lang_map.items():
4112 if long_name == code:
4113 return short_name
4114
4115
86e5f3ed 4116class ISO3166Utils:
4eb10f66
YCH
4117 # From http://data.okfn.org/data/core/country-list
4118 _country_map = {
4119 'AF': 'Afghanistan',
4120 'AX': 'Åland Islands',
4121 'AL': 'Albania',
4122 'DZ': 'Algeria',
4123 'AS': 'American Samoa',
4124 'AD': 'Andorra',
4125 'AO': 'Angola',
4126 'AI': 'Anguilla',
4127 'AQ': 'Antarctica',
4128 'AG': 'Antigua and Barbuda',
4129 'AR': 'Argentina',
4130 'AM': 'Armenia',
4131 'AW': 'Aruba',
4132 'AU': 'Australia',
4133 'AT': 'Austria',
4134 'AZ': 'Azerbaijan',
4135 'BS': 'Bahamas',
4136 'BH': 'Bahrain',
4137 'BD': 'Bangladesh',
4138 'BB': 'Barbados',
4139 'BY': 'Belarus',
4140 'BE': 'Belgium',
4141 'BZ': 'Belize',
4142 'BJ': 'Benin',
4143 'BM': 'Bermuda',
4144 'BT': 'Bhutan',
4145 'BO': 'Bolivia, Plurinational State of',
4146 'BQ': 'Bonaire, Sint Eustatius and Saba',
4147 'BA': 'Bosnia and Herzegovina',
4148 'BW': 'Botswana',
4149 'BV': 'Bouvet Island',
4150 'BR': 'Brazil',
4151 'IO': 'British Indian Ocean Territory',
4152 'BN': 'Brunei Darussalam',
4153 'BG': 'Bulgaria',
4154 'BF': 'Burkina Faso',
4155 'BI': 'Burundi',
4156 'KH': 'Cambodia',
4157 'CM': 'Cameroon',
4158 'CA': 'Canada',
4159 'CV': 'Cape Verde',
4160 'KY': 'Cayman Islands',
4161 'CF': 'Central African Republic',
4162 'TD': 'Chad',
4163 'CL': 'Chile',
4164 'CN': 'China',
4165 'CX': 'Christmas Island',
4166 'CC': 'Cocos (Keeling) Islands',
4167 'CO': 'Colombia',
4168 'KM': 'Comoros',
4169 'CG': 'Congo',
4170 'CD': 'Congo, the Democratic Republic of the',
4171 'CK': 'Cook Islands',
4172 'CR': 'Costa Rica',
4173 'CI': 'Côte d\'Ivoire',
4174 'HR': 'Croatia',
4175 'CU': 'Cuba',
4176 'CW': 'Curaçao',
4177 'CY': 'Cyprus',
4178 'CZ': 'Czech Republic',
4179 'DK': 'Denmark',
4180 'DJ': 'Djibouti',
4181 'DM': 'Dominica',
4182 'DO': 'Dominican Republic',
4183 'EC': 'Ecuador',
4184 'EG': 'Egypt',
4185 'SV': 'El Salvador',
4186 'GQ': 'Equatorial Guinea',
4187 'ER': 'Eritrea',
4188 'EE': 'Estonia',
4189 'ET': 'Ethiopia',
4190 'FK': 'Falkland Islands (Malvinas)',
4191 'FO': 'Faroe Islands',
4192 'FJ': 'Fiji',
4193 'FI': 'Finland',
4194 'FR': 'France',
4195 'GF': 'French Guiana',
4196 'PF': 'French Polynesia',
4197 'TF': 'French Southern Territories',
4198 'GA': 'Gabon',
4199 'GM': 'Gambia',
4200 'GE': 'Georgia',
4201 'DE': 'Germany',
4202 'GH': 'Ghana',
4203 'GI': 'Gibraltar',
4204 'GR': 'Greece',
4205 'GL': 'Greenland',
4206 'GD': 'Grenada',
4207 'GP': 'Guadeloupe',
4208 'GU': 'Guam',
4209 'GT': 'Guatemala',
4210 'GG': 'Guernsey',
4211 'GN': 'Guinea',
4212 'GW': 'Guinea-Bissau',
4213 'GY': 'Guyana',
4214 'HT': 'Haiti',
4215 'HM': 'Heard Island and McDonald Islands',
4216 'VA': 'Holy See (Vatican City State)',
4217 'HN': 'Honduras',
4218 'HK': 'Hong Kong',
4219 'HU': 'Hungary',
4220 'IS': 'Iceland',
4221 'IN': 'India',
4222 'ID': 'Indonesia',
4223 'IR': 'Iran, Islamic Republic of',
4224 'IQ': 'Iraq',
4225 'IE': 'Ireland',
4226 'IM': 'Isle of Man',
4227 'IL': 'Israel',
4228 'IT': 'Italy',
4229 'JM': 'Jamaica',
4230 'JP': 'Japan',
4231 'JE': 'Jersey',
4232 'JO': 'Jordan',
4233 'KZ': 'Kazakhstan',
4234 'KE': 'Kenya',
4235 'KI': 'Kiribati',
4236 'KP': 'Korea, Democratic People\'s Republic of',
4237 'KR': 'Korea, Republic of',
4238 'KW': 'Kuwait',
4239 'KG': 'Kyrgyzstan',
4240 'LA': 'Lao People\'s Democratic Republic',
4241 'LV': 'Latvia',
4242 'LB': 'Lebanon',
4243 'LS': 'Lesotho',
4244 'LR': 'Liberia',
4245 'LY': 'Libya',
4246 'LI': 'Liechtenstein',
4247 'LT': 'Lithuania',
4248 'LU': 'Luxembourg',
4249 'MO': 'Macao',
4250 'MK': 'Macedonia, the Former Yugoslav Republic of',
4251 'MG': 'Madagascar',
4252 'MW': 'Malawi',
4253 'MY': 'Malaysia',
4254 'MV': 'Maldives',
4255 'ML': 'Mali',
4256 'MT': 'Malta',
4257 'MH': 'Marshall Islands',
4258 'MQ': 'Martinique',
4259 'MR': 'Mauritania',
4260 'MU': 'Mauritius',
4261 'YT': 'Mayotte',
4262 'MX': 'Mexico',
4263 'FM': 'Micronesia, Federated States of',
4264 'MD': 'Moldova, Republic of',
4265 'MC': 'Monaco',
4266 'MN': 'Mongolia',
4267 'ME': 'Montenegro',
4268 'MS': 'Montserrat',
4269 'MA': 'Morocco',
4270 'MZ': 'Mozambique',
4271 'MM': 'Myanmar',
4272 'NA': 'Namibia',
4273 'NR': 'Nauru',
4274 'NP': 'Nepal',
4275 'NL': 'Netherlands',
4276 'NC': 'New Caledonia',
4277 'NZ': 'New Zealand',
4278 'NI': 'Nicaragua',
4279 'NE': 'Niger',
4280 'NG': 'Nigeria',
4281 'NU': 'Niue',
4282 'NF': 'Norfolk Island',
4283 'MP': 'Northern Mariana Islands',
4284 'NO': 'Norway',
4285 'OM': 'Oman',
4286 'PK': 'Pakistan',
4287 'PW': 'Palau',
4288 'PS': 'Palestine, State of',
4289 'PA': 'Panama',
4290 'PG': 'Papua New Guinea',
4291 'PY': 'Paraguay',
4292 'PE': 'Peru',
4293 'PH': 'Philippines',
4294 'PN': 'Pitcairn',
4295 'PL': 'Poland',
4296 'PT': 'Portugal',
4297 'PR': 'Puerto Rico',
4298 'QA': 'Qatar',
4299 'RE': 'Réunion',
4300 'RO': 'Romania',
4301 'RU': 'Russian Federation',
4302 'RW': 'Rwanda',
4303 'BL': 'Saint Barthélemy',
4304 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4305 'KN': 'Saint Kitts and Nevis',
4306 'LC': 'Saint Lucia',
4307 'MF': 'Saint Martin (French part)',
4308 'PM': 'Saint Pierre and Miquelon',
4309 'VC': 'Saint Vincent and the Grenadines',
4310 'WS': 'Samoa',
4311 'SM': 'San Marino',
4312 'ST': 'Sao Tome and Principe',
4313 'SA': 'Saudi Arabia',
4314 'SN': 'Senegal',
4315 'RS': 'Serbia',
4316 'SC': 'Seychelles',
4317 'SL': 'Sierra Leone',
4318 'SG': 'Singapore',
4319 'SX': 'Sint Maarten (Dutch part)',
4320 'SK': 'Slovakia',
4321 'SI': 'Slovenia',
4322 'SB': 'Solomon Islands',
4323 'SO': 'Somalia',
4324 'ZA': 'South Africa',
4325 'GS': 'South Georgia and the South Sandwich Islands',
4326 'SS': 'South Sudan',
4327 'ES': 'Spain',
4328 'LK': 'Sri Lanka',
4329 'SD': 'Sudan',
4330 'SR': 'Suriname',
4331 'SJ': 'Svalbard and Jan Mayen',
4332 'SZ': 'Swaziland',
4333 'SE': 'Sweden',
4334 'CH': 'Switzerland',
4335 'SY': 'Syrian Arab Republic',
4336 'TW': 'Taiwan, Province of China',
4337 'TJ': 'Tajikistan',
4338 'TZ': 'Tanzania, United Republic of',
4339 'TH': 'Thailand',
4340 'TL': 'Timor-Leste',
4341 'TG': 'Togo',
4342 'TK': 'Tokelau',
4343 'TO': 'Tonga',
4344 'TT': 'Trinidad and Tobago',
4345 'TN': 'Tunisia',
4346 'TR': 'Turkey',
4347 'TM': 'Turkmenistan',
4348 'TC': 'Turks and Caicos Islands',
4349 'TV': 'Tuvalu',
4350 'UG': 'Uganda',
4351 'UA': 'Ukraine',
4352 'AE': 'United Arab Emirates',
4353 'GB': 'United Kingdom',
4354 'US': 'United States',
4355 'UM': 'United States Minor Outlying Islands',
4356 'UY': 'Uruguay',
4357 'UZ': 'Uzbekistan',
4358 'VU': 'Vanuatu',
4359 'VE': 'Venezuela, Bolivarian Republic of',
4360 'VN': 'Viet Nam',
4361 'VG': 'Virgin Islands, British',
4362 'VI': 'Virgin Islands, U.S.',
4363 'WF': 'Wallis and Futuna',
4364 'EH': 'Western Sahara',
4365 'YE': 'Yemen',
4366 'ZM': 'Zambia',
4367 'ZW': 'Zimbabwe',
2f97cc61 4368 # Not ISO 3166 codes, but used for IP blocks
4369 'AP': 'Asia/Pacific Region',
4370 'EU': 'Europe',
4eb10f66
YCH
4371 }
4372
4373 @classmethod
4374 def short2full(cls, code):
4375 """Convert an ISO 3166-2 country code to the corresponding full name"""
4376 return cls._country_map.get(code.upper())
4377
4378
86e5f3ed 4379class GeoUtils:
773f291d
S
4380 # Major IPv4 address blocks per country
4381 _country_ip_map = {
53896ca5 4382 'AD': '46.172.224.0/19',
773f291d
S
4383 'AE': '94.200.0.0/13',
4384 'AF': '149.54.0.0/17',
4385 'AG': '209.59.64.0/18',
4386 'AI': '204.14.248.0/21',
4387 'AL': '46.99.0.0/16',
4388 'AM': '46.70.0.0/15',
4389 'AO': '105.168.0.0/13',
53896ca5
S
4390 'AP': '182.50.184.0/21',
4391 'AQ': '23.154.160.0/24',
773f291d
S
4392 'AR': '181.0.0.0/12',
4393 'AS': '202.70.112.0/20',
53896ca5 4394 'AT': '77.116.0.0/14',
773f291d
S
4395 'AU': '1.128.0.0/11',
4396 'AW': '181.41.0.0/18',
53896ca5
S
4397 'AX': '185.217.4.0/22',
4398 'AZ': '5.197.0.0/16',
773f291d
S
4399 'BA': '31.176.128.0/17',
4400 'BB': '65.48.128.0/17',
4401 'BD': '114.130.0.0/16',
4402 'BE': '57.0.0.0/8',
53896ca5 4403 'BF': '102.178.0.0/15',
773f291d
S
4404 'BG': '95.42.0.0/15',
4405 'BH': '37.131.0.0/17',
4406 'BI': '154.117.192.0/18',
4407 'BJ': '137.255.0.0/16',
53896ca5 4408 'BL': '185.212.72.0/23',
773f291d
S
4409 'BM': '196.12.64.0/18',
4410 'BN': '156.31.0.0/16',
4411 'BO': '161.56.0.0/16',
4412 'BQ': '161.0.80.0/20',
53896ca5 4413 'BR': '191.128.0.0/12',
773f291d
S
4414 'BS': '24.51.64.0/18',
4415 'BT': '119.2.96.0/19',
4416 'BW': '168.167.0.0/16',
4417 'BY': '178.120.0.0/13',
4418 'BZ': '179.42.192.0/18',
4419 'CA': '99.224.0.0/11',
4420 'CD': '41.243.0.0/16',
53896ca5
S
4421 'CF': '197.242.176.0/21',
4422 'CG': '160.113.0.0/16',
773f291d 4423 'CH': '85.0.0.0/13',
53896ca5 4424 'CI': '102.136.0.0/14',
773f291d
S
4425 'CK': '202.65.32.0/19',
4426 'CL': '152.172.0.0/14',
53896ca5 4427 'CM': '102.244.0.0/14',
773f291d
S
4428 'CN': '36.128.0.0/10',
4429 'CO': '181.240.0.0/12',
4430 'CR': '201.192.0.0/12',
4431 'CU': '152.206.0.0/15',
4432 'CV': '165.90.96.0/19',
4433 'CW': '190.88.128.0/17',
53896ca5 4434 'CY': '31.153.0.0/16',
773f291d
S
4435 'CZ': '88.100.0.0/14',
4436 'DE': '53.0.0.0/8',
4437 'DJ': '197.241.0.0/17',
4438 'DK': '87.48.0.0/12',
4439 'DM': '192.243.48.0/20',
4440 'DO': '152.166.0.0/15',
4441 'DZ': '41.96.0.0/12',
4442 'EC': '186.68.0.0/15',
4443 'EE': '90.190.0.0/15',
4444 'EG': '156.160.0.0/11',
4445 'ER': '196.200.96.0/20',
4446 'ES': '88.0.0.0/11',
4447 'ET': '196.188.0.0/14',
4448 'EU': '2.16.0.0/13',
4449 'FI': '91.152.0.0/13',
4450 'FJ': '144.120.0.0/16',
53896ca5 4451 'FK': '80.73.208.0/21',
773f291d
S
4452 'FM': '119.252.112.0/20',
4453 'FO': '88.85.32.0/19',
4454 'FR': '90.0.0.0/9',
4455 'GA': '41.158.0.0/15',
4456 'GB': '25.0.0.0/8',
4457 'GD': '74.122.88.0/21',
4458 'GE': '31.146.0.0/16',
4459 'GF': '161.22.64.0/18',
4460 'GG': '62.68.160.0/19',
53896ca5
S
4461 'GH': '154.160.0.0/12',
4462 'GI': '95.164.0.0/16',
773f291d
S
4463 'GL': '88.83.0.0/19',
4464 'GM': '160.182.0.0/15',
4465 'GN': '197.149.192.0/18',
4466 'GP': '104.250.0.0/19',
4467 'GQ': '105.235.224.0/20',
4468 'GR': '94.64.0.0/13',
4469 'GT': '168.234.0.0/16',
4470 'GU': '168.123.0.0/16',
4471 'GW': '197.214.80.0/20',
4472 'GY': '181.41.64.0/18',
4473 'HK': '113.252.0.0/14',
4474 'HN': '181.210.0.0/16',
4475 'HR': '93.136.0.0/13',
4476 'HT': '148.102.128.0/17',
4477 'HU': '84.0.0.0/14',
4478 'ID': '39.192.0.0/10',
4479 'IE': '87.32.0.0/12',
4480 'IL': '79.176.0.0/13',
4481 'IM': '5.62.80.0/20',
4482 'IN': '117.192.0.0/10',
4483 'IO': '203.83.48.0/21',
4484 'IQ': '37.236.0.0/14',
4485 'IR': '2.176.0.0/12',
4486 'IS': '82.221.0.0/16',
4487 'IT': '79.0.0.0/10',
4488 'JE': '87.244.64.0/18',
4489 'JM': '72.27.0.0/17',
4490 'JO': '176.29.0.0/16',
53896ca5 4491 'JP': '133.0.0.0/8',
773f291d
S
4492 'KE': '105.48.0.0/12',
4493 'KG': '158.181.128.0/17',
4494 'KH': '36.37.128.0/17',
4495 'KI': '103.25.140.0/22',
4496 'KM': '197.255.224.0/20',
53896ca5 4497 'KN': '198.167.192.0/19',
773f291d
S
4498 'KP': '175.45.176.0/22',
4499 'KR': '175.192.0.0/10',
4500 'KW': '37.36.0.0/14',
4501 'KY': '64.96.0.0/15',
4502 'KZ': '2.72.0.0/13',
4503 'LA': '115.84.64.0/18',
4504 'LB': '178.135.0.0/16',
53896ca5 4505 'LC': '24.92.144.0/20',
773f291d
S
4506 'LI': '82.117.0.0/19',
4507 'LK': '112.134.0.0/15',
53896ca5 4508 'LR': '102.183.0.0/16',
773f291d
S
4509 'LS': '129.232.0.0/17',
4510 'LT': '78.56.0.0/13',
4511 'LU': '188.42.0.0/16',
4512 'LV': '46.109.0.0/16',
4513 'LY': '41.252.0.0/14',
4514 'MA': '105.128.0.0/11',
4515 'MC': '88.209.64.0/18',
4516 'MD': '37.246.0.0/16',
4517 'ME': '178.175.0.0/17',
4518 'MF': '74.112.232.0/21',
4519 'MG': '154.126.0.0/17',
4520 'MH': '117.103.88.0/21',
4521 'MK': '77.28.0.0/15',
4522 'ML': '154.118.128.0/18',
4523 'MM': '37.111.0.0/17',
4524 'MN': '49.0.128.0/17',
4525 'MO': '60.246.0.0/16',
4526 'MP': '202.88.64.0/20',
4527 'MQ': '109.203.224.0/19',
4528 'MR': '41.188.64.0/18',
4529 'MS': '208.90.112.0/22',
4530 'MT': '46.11.0.0/16',
4531 'MU': '105.16.0.0/12',
4532 'MV': '27.114.128.0/18',
53896ca5 4533 'MW': '102.70.0.0/15',
773f291d
S
4534 'MX': '187.192.0.0/11',
4535 'MY': '175.136.0.0/13',
4536 'MZ': '197.218.0.0/15',
4537 'NA': '41.182.0.0/16',
4538 'NC': '101.101.0.0/18',
4539 'NE': '197.214.0.0/18',
4540 'NF': '203.17.240.0/22',
4541 'NG': '105.112.0.0/12',
4542 'NI': '186.76.0.0/15',
4543 'NL': '145.96.0.0/11',
4544 'NO': '84.208.0.0/13',
4545 'NP': '36.252.0.0/15',
4546 'NR': '203.98.224.0/19',
4547 'NU': '49.156.48.0/22',
4548 'NZ': '49.224.0.0/14',
4549 'OM': '5.36.0.0/15',
4550 'PA': '186.72.0.0/15',
4551 'PE': '186.160.0.0/14',
4552 'PF': '123.50.64.0/18',
4553 'PG': '124.240.192.0/19',
4554 'PH': '49.144.0.0/13',
4555 'PK': '39.32.0.0/11',
4556 'PL': '83.0.0.0/11',
4557 'PM': '70.36.0.0/20',
4558 'PR': '66.50.0.0/16',
4559 'PS': '188.161.0.0/16',
4560 'PT': '85.240.0.0/13',
4561 'PW': '202.124.224.0/20',
4562 'PY': '181.120.0.0/14',
4563 'QA': '37.210.0.0/15',
53896ca5 4564 'RE': '102.35.0.0/16',
773f291d 4565 'RO': '79.112.0.0/13',
53896ca5 4566 'RS': '93.86.0.0/15',
773f291d 4567 'RU': '5.136.0.0/13',
53896ca5 4568 'RW': '41.186.0.0/16',
773f291d
S
4569 'SA': '188.48.0.0/13',
4570 'SB': '202.1.160.0/19',
4571 'SC': '154.192.0.0/11',
53896ca5 4572 'SD': '102.120.0.0/13',
773f291d 4573 'SE': '78.64.0.0/12',
53896ca5 4574 'SG': '8.128.0.0/10',
773f291d
S
4575 'SI': '188.196.0.0/14',
4576 'SK': '78.98.0.0/15',
53896ca5 4577 'SL': '102.143.0.0/17',
773f291d
S
4578 'SM': '89.186.32.0/19',
4579 'SN': '41.82.0.0/15',
53896ca5 4580 'SO': '154.115.192.0/18',
773f291d
S
4581 'SR': '186.179.128.0/17',
4582 'SS': '105.235.208.0/21',
4583 'ST': '197.159.160.0/19',
4584 'SV': '168.243.0.0/16',
4585 'SX': '190.102.0.0/20',
4586 'SY': '5.0.0.0/16',
4587 'SZ': '41.84.224.0/19',
4588 'TC': '65.255.48.0/20',
4589 'TD': '154.68.128.0/19',
4590 'TG': '196.168.0.0/14',
4591 'TH': '171.96.0.0/13',
4592 'TJ': '85.9.128.0/18',
4593 'TK': '27.96.24.0/21',
4594 'TL': '180.189.160.0/20',
4595 'TM': '95.85.96.0/19',
4596 'TN': '197.0.0.0/11',
4597 'TO': '175.176.144.0/21',
4598 'TR': '78.160.0.0/11',
4599 'TT': '186.44.0.0/15',
4600 'TV': '202.2.96.0/19',
4601 'TW': '120.96.0.0/11',
4602 'TZ': '156.156.0.0/14',
53896ca5
S
4603 'UA': '37.52.0.0/14',
4604 'UG': '102.80.0.0/13',
4605 'US': '6.0.0.0/8',
773f291d 4606 'UY': '167.56.0.0/13',
53896ca5 4607 'UZ': '84.54.64.0/18',
773f291d 4608 'VA': '212.77.0.0/19',
53896ca5 4609 'VC': '207.191.240.0/21',
773f291d 4610 'VE': '186.88.0.0/13',
53896ca5 4611 'VG': '66.81.192.0/20',
773f291d
S
4612 'VI': '146.226.0.0/16',
4613 'VN': '14.160.0.0/11',
4614 'VU': '202.80.32.0/20',
4615 'WF': '117.20.32.0/21',
4616 'WS': '202.4.32.0/19',
4617 'YE': '134.35.0.0/16',
4618 'YT': '41.242.116.0/22',
4619 'ZA': '41.0.0.0/11',
53896ca5
S
4620 'ZM': '102.144.0.0/13',
4621 'ZW': '102.177.192.0/18',
773f291d
S
4622 }
4623
4624 @classmethod
5f95927a
S
4625 def random_ipv4(cls, code_or_block):
4626 if len(code_or_block) == 2:
4627 block = cls._country_ip_map.get(code_or_block.upper())
4628 if not block:
4629 return None
4630 else:
4631 block = code_or_block
773f291d 4632 addr, preflen = block.split('/')
ac668111 4633 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4634 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 4635 return compat_str(socket.inet_ntoa(
ac668111 4636 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4637
4638
ac668111 4639class PerRequestProxyHandler(urllib.request.ProxyHandler):
2461f79d
PH
4640 def __init__(self, proxies=None):
4641 # Set default handlers
4642 for type in ('http', 'https'):
4643 setattr(self, '%s_open' % type,
4644 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4645 meth(r, proxy, type))
ac668111 4646 urllib.request.ProxyHandler.__init__(self, proxies)
2461f79d 4647
91410c9b 4648 def proxy_open(self, req, proxy, type):
2461f79d 4649 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4650 if req_proxy is not None:
4651 proxy = req_proxy
2461f79d
PH
4652 del req.headers['Ytdl-request-proxy']
4653
4654 if proxy == '__noproxy__':
4655 return None # No Proxy
51fb4995 4656 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4657 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4658 # yt-dlp's http/https handlers do wrapping the socket with socks
71aff188 4659 return None
ac668111 4660 return urllib.request.ProxyHandler.proxy_open(
91410c9b 4661 self, req, proxy, type)
5bc880b9
YCH
4662
4663
0a5445dd
YCH
4664# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4665# released into Public Domain
4666# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4667
4668def long_to_bytes(n, blocksize=0):
4669 """long_to_bytes(n:long, blocksize:int) : string
4670 Convert a long integer to a byte string.
4671
4672 If optional blocksize is given and greater than zero, pad the front of the
4673 byte string with binary zeros so that the length is a multiple of
4674 blocksize.
4675 """
4676 # after much testing, this algorithm was deemed to be the fastest
4677 s = b''
4678 n = int(n)
4679 while n > 0:
ac668111 4680 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4681 n = n >> 32
4682 # strip off leading zeros
4683 for i in range(len(s)):
4684 if s[i] != b'\000'[0]:
4685 break
4686 else:
4687 # only happens when n == 0
4688 s = b'\000'
4689 i = 0
4690 s = s[i:]
4691 # add back some pad bytes. this could be done more efficiently w.r.t. the
4692 # de-padding being done above, but sigh...
4693 if blocksize > 0 and len(s) % blocksize:
4694 s = (blocksize - len(s) % blocksize) * b'\000' + s
4695 return s
4696
4697
4698def bytes_to_long(s):
4699 """bytes_to_long(string) : long
4700 Convert a byte string to a long integer.
4701
4702 This is (essentially) the inverse of long_to_bytes().
4703 """
4704 acc = 0
4705 length = len(s)
4706 if length % 4:
4707 extra = (4 - length % 4)
4708 s = b'\000' * extra + s
4709 length = length + extra
4710 for i in range(0, length, 4):
ac668111 4711 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4712 return acc
4713
4714
5bc880b9
YCH
4715def ohdave_rsa_encrypt(data, exponent, modulus):
4716 '''
4717 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4718
4719 Input:
4720 data: data to encrypt, bytes-like object
4721 exponent, modulus: parameter e and N of RSA algorithm, both integer
4722 Output: hex string of encrypted data
4723
4724 Limitation: supports one block encryption only
4725 '''
4726
4727 payload = int(binascii.hexlify(data[::-1]), 16)
4728 encrypted = pow(payload, exponent, modulus)
4729 return '%x' % encrypted
81bdc8fd
YCH
4730
4731
f48409c7
YCH
4732def pkcs1pad(data, length):
4733 """
4734 Padding input data with PKCS#1 scheme
4735
4736 @param {int[]} data input data
4737 @param {int} length target length
4738 @returns {int[]} padded data
4739 """
4740 if len(data) > length - 11:
4741 raise ValueError('Input data too long for PKCS#1 padding')
4742
4743 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4744 return [0, 2] + pseudo_random + [0] + data
4745
4746
7b2c3f47 4747def _base_n_table(n, table):
4748 if not table and not n:
4749 raise ValueError('Either table or n must be specified')
612f2be5 4750 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4751
4752 if n != len(table):
4753 raise ValueError(f'base {n} exceeds table length {len(table)}')
4754 return table
59f898b7 4755
5eb6bdce 4756
7b2c3f47 4757def encode_base_n(num, n=None, table=None):
4758 """Convert given int to a base-n string"""
612f2be5 4759 table = _base_n_table(n, table)
7b2c3f47 4760 if not num:
5eb6bdce
YCH
4761 return table[0]
4762
7b2c3f47 4763 result, base = '', len(table)
81bdc8fd 4764 while num:
7b2c3f47 4765 result = table[num % base] + result
612f2be5 4766 num = num // base
7b2c3f47 4767 return result
4768
4769
4770def decode_base_n(string, n=None, table=None):
4771 """Convert given base-n string to int"""
4772 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4773 result, base = 0, len(table)
4774 for char in string:
4775 result = result * base + table[char]
4776 return result
4777
4778
4779def decode_base(value, digits):
4780 write_string('DeprecationWarning: yt_dlp.utils.decode_base is deprecated '
4781 'and may be removed in a future version. Use yt_dlp.decode_base_n instead')
4782 return decode_base_n(value, table=digits)
f52354a8
YCH
4783
4784
4785def decode_packed_codes(code):
06b3fe29 4786 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4787 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4788 base = int(base)
4789 count = int(count)
4790 symbols = symbols.split('|')
4791 symbol_table = {}
4792
4793 while count:
4794 count -= 1
5eb6bdce 4795 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4796 symbol_table[base_n_count] = symbols[count] or base_n_count
4797
4798 return re.sub(
4799 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4800 obfuscated_code)
e154c651 4801
4802
1ced2221
S
4803def caesar(s, alphabet, shift):
4804 if shift == 0:
4805 return s
4806 l = len(alphabet)
4807 return ''.join(
4808 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4809 for c in s)
4810
4811
4812def rot47(s):
4813 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4814
4815
e154c651 4816def parse_m3u8_attributes(attrib):
4817 info = {}
4818 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4819 if val.startswith('"'):
4820 val = val[1:-1]
4821 info[key] = val
4822 return info
1143535d
YCH
4823
4824
4825def urshift(val, n):
4826 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
4827
4828
4829# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4830# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
4831def decode_png(png_data):
4832 # Reference: https://www.w3.org/TR/PNG/
4833 header = png_data[8:]
4834
4835 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
86e5f3ed 4836 raise OSError('Not a valid PNG file.')
d3f8e038
YCH
4837
4838 int_map = {1: '>B', 2: '>H', 4: '>I'}
ac668111 4839 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
d3f8e038
YCH
4840
4841 chunks = []
4842
4843 while header:
4844 length = unpack_integer(header[:4])
4845 header = header[4:]
4846
4847 chunk_type = header[:4]
4848 header = header[4:]
4849
4850 chunk_data = header[:length]
4851 header = header[length:]
4852
4853 header = header[4:] # Skip CRC
4854
4855 chunks.append({
4856 'type': chunk_type,
4857 'length': length,
4858 'data': chunk_data
4859 })
4860
4861 ihdr = chunks[0]['data']
4862
4863 width = unpack_integer(ihdr[:4])
4864 height = unpack_integer(ihdr[4:8])
4865
4866 idat = b''
4867
4868 for chunk in chunks:
4869 if chunk['type'] == b'IDAT':
4870 idat += chunk['data']
4871
4872 if not idat:
86e5f3ed 4873 raise OSError('Unable to read PNG data.')
d3f8e038
YCH
4874
4875 decompressed_data = bytearray(zlib.decompress(idat))
4876
4877 stride = width * 3
4878 pixels = []
4879
4880 def _get_pixel(idx):
4881 x = idx % stride
4882 y = idx // stride
4883 return pixels[y][x]
4884
4885 for y in range(height):
4886 basePos = y * (1 + stride)
4887 filter_type = decompressed_data[basePos]
4888
4889 current_row = []
4890
4891 pixels.append(current_row)
4892
4893 for x in range(stride):
4894 color = decompressed_data[1 + basePos + x]
4895 basex = y * stride + x
4896 left = 0
4897 up = 0
4898
4899 if x > 2:
4900 left = _get_pixel(basex - 3)
4901 if y > 0:
4902 up = _get_pixel(basex - stride)
4903
4904 if filter_type == 1: # Sub
4905 color = (color + left) & 0xff
4906 elif filter_type == 2: # Up
4907 color = (color + up) & 0xff
4908 elif filter_type == 3: # Average
4909 color = (color + ((left + up) >> 1)) & 0xff
4910 elif filter_type == 4: # Paeth
4911 a = left
4912 b = up
4913 c = 0
4914
4915 if x > 2 and y > 0:
4916 c = _get_pixel(basex - stride - 3)
4917
4918 p = a + b - c
4919
4920 pa = abs(p - a)
4921 pb = abs(p - b)
4922 pc = abs(p - c)
4923
4924 if pa <= pb and pa <= pc:
4925 color = (color + a) & 0xff
4926 elif pb <= pc:
4927 color = (color + b) & 0xff
4928 else:
4929 color = (color + c) & 0xff
4930
4931 current_row.append(color)
4932
4933 return width, height, pixels
efa97bdc
YCH
4934
4935
4936def write_xattr(path, key, value):
6f7563be 4937 # Windows: Write xattrs to NTFS Alternate Data Streams:
4938 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4939 if compat_os_name == 'nt':
4940 assert ':' not in key
4941 assert os.path.exists(path)
efa97bdc
YCH
4942
4943 try:
6f7563be 4944 with open(f'{path}:{key}', 'wb') as f:
4945 f.write(value)
86e5f3ed 4946 except OSError as e:
efa97bdc 4947 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 4948 return
efa97bdc 4949
6f7563be 4950 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 4951
6f7563be 4952 setxattr = None
4953 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4954 # Unicode arguments are not supported in pyxattr until version 0.5.0
4955 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4956 if version_tuple(xattr.__version__) >= (0, 5, 0):
4957 setxattr = xattr.set
4958 elif xattr:
4959 setxattr = xattr.setxattr
efa97bdc 4960
6f7563be 4961 if setxattr:
4962 try:
4963 setxattr(path, key, value)
4964 except OSError as e:
4965 raise XAttrMetadataError(e.errno, e.strerror)
4966 return
efa97bdc 4967
6f7563be 4968 # UNIX Method 2. Use setfattr/xattr executables
4969 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4970 else 'xattr' if check_executable('xattr', ['-h']) else None)
4971 if not exe:
4972 raise XAttrUnavailableError(
4973 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4974 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 4975
0f06bcd7 4976 value = value.decode()
6f7563be 4977 try:
f0c9fb96 4978 _, stderr, returncode = Popen.run(
6f7563be 4979 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 4980 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 4981 except OSError as e:
4982 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 4983 if returncode:
4984 raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
4985
4986
4987def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
4988 start_date = datetime.date(1950, 1, 1)
4989 end_date = datetime.date(1995, 12, 31)
4990 offset = random.randint(0, (end_date - start_date).days)
4991 random_date = start_date + datetime.timedelta(offset)
0c265486 4992 return {
aa374bc7
AS
4993 year_field: str(random_date.year),
4994 month_field: str(random_date.month),
4995 day_field: str(random_date.day),
0c265486 4996 }
732044af 4997
c76eb41b 4998
732044af 4999# Templates for internet shortcut files, which are plain text files.
e5a998f3 5000DOT_URL_LINK_TEMPLATE = '''\
732044af 5001[InternetShortcut]
5002URL=%(url)s
e5a998f3 5003'''
732044af 5004
e5a998f3 5005DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 5006<?xml version="1.0" encoding="UTF-8"?>
5007<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5008<plist version="1.0">
5009<dict>
5010\t<key>URL</key>
5011\t<string>%(url)s</string>
5012</dict>
5013</plist>
e5a998f3 5014'''
732044af 5015
e5a998f3 5016DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 5017[Desktop Entry]
5018Encoding=UTF-8
5019Name=%(filename)s
5020Type=Link
5021URL=%(url)s
5022Icon=text-html
e5a998f3 5023'''
732044af 5024
08438d2c 5025LINK_TEMPLATES = {
5026 'url': DOT_URL_LINK_TEMPLATE,
5027 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5028 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5029}
5030
732044af 5031
5032def iri_to_uri(iri):
5033 """
5034 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5035
5036 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
5037 """
5038
5039 iri_parts = compat_urllib_parse_urlparse(iri)
5040
5041 if '[' in iri_parts.netloc:
5042 raise ValueError('IPv6 URIs are not, yet, supported.')
5043 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5044
5045 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5046
5047 net_location = ''
5048 if iri_parts.username:
f9934b96 5049 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 5050 if iri_parts.password is not None:
f9934b96 5051 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 5052 net_location += '@'
5053
0f06bcd7 5054 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 5055 # The 'idna' encoding produces ASCII text.
5056 if iri_parts.port is not None and iri_parts.port != 80:
5057 net_location += ':' + str(iri_parts.port)
5058
f9934b96 5059 return urllib.parse.urlunparse(
732044af 5060 (iri_parts.scheme,
5061 net_location,
5062
f9934b96 5063 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5064
5065 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 5066 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5067
5068 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 5069 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 5070
f9934b96 5071 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 5072
5073 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5074
5075
5076def to_high_limit_path(path):
5077 if sys.platform in ['win32', 'cygwin']:
5078 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 5079 return '\\\\?\\' + os.path.abspath(path)
732044af 5080
5081 return path
76d321f6 5082
c76eb41b 5083
7b2c3f47 5084def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
e0ddbd02 5085 val = traverse_obj(obj, *variadic(field))
7b2c3f47 5086 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 5087 return default
7b2c3f47 5088 return template % func(val)
00dd0cd5 5089
5090
5091def clean_podcast_url(url):
5092 return re.sub(r'''(?x)
5093 (?:
5094 (?:
5095 chtbl\.com/track|
5096 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5097 play\.podtrac\.com
5098 )/[^/]+|
5099 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5100 flex\.acast\.com|
5101 pd(?:
5102 cn\.co| # https://podcorn.com/analytics-prefix/
5103 st\.fm # https://podsights.com/docs/
5104 )/e
5105 )/''', '', url)
ffcb8191
THD
5106
5107
5108_HEX_TABLE = '0123456789abcdef'
5109
5110
5111def random_uuidv4():
5112 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5113
5114
5115def make_dir(path, to_screen=None):
5116 try:
5117 dn = os.path.dirname(path)
5118 if dn and not os.path.exists(dn):
5119 os.makedirs(dn)
5120 return True
86e5f3ed 5121 except OSError as err:
0202b52a 5122 if callable(to_screen) is not None:
5123 to_screen('unable to create directory ' + error_to_compat_str(err))
5124 return False
f74980cb 5125
5126
5127def get_executable_path():
b5899f4f 5128 from .update import _get_variant_and_executable_path
c487cf00 5129
b5899f4f 5130 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5131
5132
2f567473 5133def load_plugins(name, suffix, namespace):
3ae5e797 5134 classes = {}
19a03940 5135 with contextlib.suppress(FileNotFoundError):
019a94f7
ÁS
5136 plugins_spec = importlib.util.spec_from_file_location(
5137 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5138 plugins = importlib.util.module_from_spec(plugins_spec)
5139 sys.modules[plugins_spec.name] = plugins
5140 plugins_spec.loader.exec_module(plugins)
f74980cb 5141 for name in dir(plugins):
2f567473 5142 if name in namespace:
5143 continue
5144 if not name.endswith(suffix):
f74980cb 5145 continue
5146 klass = getattr(plugins, name)
3ae5e797 5147 classes[name] = namespace[name] = klass
f74980cb 5148 return classes
06167fbb 5149
5150
325ebc17 5151def traverse_obj(
352d63fd 5152 obj, *path_list, default=None, expected_type=None, get_all=True,
325ebc17 5153 casesense=True, is_user_input=False, traverse_string=False):
324ad820 5154 ''' Traverse nested list/dict/tuple
8f334380 5155 @param path_list A list of paths which are checked one by one.
19a03940 5156 Each path is a list of keys where each key is a:
5157 - None: Do nothing
5158 - string: A dictionary key
5159 - int: An index into a list
5160 - tuple: A list of keys all of which will be traversed
5161 - Ellipsis: Fetch all values in the object
5162 - Function: Takes the key and value as arguments
5163 and returns whether the key matches or not
325ebc17 5164 @param default Default value to return
352d63fd 5165 @param expected_type Only accept final value of this type (Can also be any callable)
5166 @param get_all Return all the values obtained from a path or only the first one
324ad820 5167 @param casesense Whether to consider dictionary keys as case sensitive
5168 @param is_user_input Whether the keys are generated from user input. If True,
5169 strings are converted to int/slice if necessary
5170 @param traverse_string Whether to traverse inside strings. If True, any
5171 non-compatible object will also be converted into a string
8f334380 5172 # TODO: Write tests
324ad820 5173 '''
325ebc17 5174 if not casesense:
dbf5416a 5175 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
8f334380 5176 path_list = (map(_lower, variadic(path)) for path in path_list)
5177
5178 def _traverse_obj(obj, path, _current_depth=0):
5179 nonlocal depth
5180 path = tuple(variadic(path))
5181 for i, key in enumerate(path):
1797b073 5182 if None in (key, obj):
5183 return obj
8f334380 5184 if isinstance(key, (list, tuple)):
5185 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5186 key = ...
5187 if key is ...:
5188 obj = (obj.values() if isinstance(obj, dict)
5189 else obj if isinstance(obj, (list, tuple, LazyList))
5190 else str(obj) if traverse_string else [])
5191 _current_depth += 1
5192 depth = max(depth, _current_depth)
5193 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
2614f646 5194 elif callable(key):
5195 if isinstance(obj, (list, tuple, LazyList)):
5196 obj = enumerate(obj)
5197 elif isinstance(obj, dict):
5198 obj = obj.items()
5199 else:
5200 if not traverse_string:
5201 return None
5202 obj = str(obj)
5203 _current_depth += 1
5204 depth = max(depth, _current_depth)
e6f868a6 5205 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
575e17a1 5206 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
325ebc17 5207 obj = (obj.get(key) if casesense or (key in obj)
5208 else next((v for k, v in obj.items() if _lower(k) == key), None))
5209 else:
5210 if is_user_input:
5211 key = (int_or_none(key) if ':' not in key
5212 else slice(*map(int_or_none, key.split(':'))))
8f334380 5213 if key == slice(None):
575e17a1 5214 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
325ebc17 5215 if not isinstance(key, (int, slice)):
9fea350f 5216 return None
8f334380 5217 if not isinstance(obj, (list, tuple, LazyList)):
325ebc17 5218 if not traverse_string:
5219 return None
5220 obj = str(obj)
5221 try:
5222 obj = obj[key]
5223 except IndexError:
324ad820 5224 return None
325ebc17 5225 return obj
5226
352d63fd 5227 if isinstance(expected_type, type):
5228 type_test = lambda val: val if isinstance(val, expected_type) else None
352d63fd 5229 else:
7b2c3f47 5230 type_test = expected_type or IDENTITY
352d63fd 5231
8f334380 5232 for path in path_list:
5233 depth = 0
5234 val = _traverse_obj(obj, path)
325ebc17 5235 if val is not None:
8f334380 5236 if depth:
5237 for _ in range(depth - 1):
6586bca9 5238 val = itertools.chain.from_iterable(v for v in val if v is not None)
352d63fd 5239 val = [v for v in map(type_test, val) if v is not None]
8f334380 5240 if val:
352d63fd 5241 return val if get_all else val[0]
5242 else:
5243 val = type_test(val)
5244 if val is not None:
8f334380 5245 return val
325ebc17 5246 return default
324ad820 5247
5248
5249def traverse_dict(dictn, keys, casesense=True):
ee8dd27a 5250 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5251 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5252 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5253
5254
ff91cf74 5255def get_first(obj, keys, **kwargs):
5256 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5257
5258
4b4b7f74 5259def variadic(x, allowed_types=(str, bytes, dict)):
cb89cfc1 5260 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
bd50a52b
THD
5261
5262
3e9b66d7
LNO
5263def time_seconds(**kwargs):
5264 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5265 return t.timestamp()
5266
5267
49fa4d9a
N
5268# create a JSON Web Signature (jws) with HS256 algorithm
5269# the resulting format is in JWS Compact Serialization
5270# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5271# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5272def jwt_encode_hs256(payload_data, key, headers={}):
5273 header_data = {
5274 'alg': 'HS256',
5275 'typ': 'JWT',
5276 }
5277 if headers:
5278 header_data.update(headers)
0f06bcd7 5279 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5280 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5281 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
5282 signature_b64 = base64.b64encode(h.digest())
5283 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5284 return token
819e0531 5285
5286
16b0d7e6 5287# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5288def jwt_decode_hs256(jwt):
5289 header_b64, payload_b64, signature_b64 = jwt.split('.')
5290 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5291 return payload_data
5292
5293
53973b4d 5294WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5295
5296
0b9c08b4 5297@functools.cache
819e0531 5298def supports_terminal_sequences(stream):
5299 if compat_os_name == 'nt':
8a82af35 5300 if not WINDOWS_VT_MODE:
819e0531 5301 return False
5302 elif not os.getenv('TERM'):
5303 return False
5304 try:
5305 return stream.isatty()
5306 except BaseException:
5307 return False
5308
5309
53973b4d 5310def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
8a82af35 5311 if get_windows_version() < (10, 0, 10586):
53973b4d 5312 return
5313 global WINDOWS_VT_MODE
53973b4d 5314 try:
f0c9fb96 5315 Popen.run('', shell=True)
53973b4d 5316 except Exception:
5317 return
5318
5319 WINDOWS_VT_MODE = True
5320 supports_terminal_sequences.cache_clear()
5321
5322
ec11a9f4 5323_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5324
5325
5326def remove_terminal_sequences(string):
5327 return _terminal_sequences_re.sub('', string)
5328
5329
5330def number_of_digits(number):
5331 return len('%d' % number)
34921b43 5332
5333
5334def join_nonempty(*values, delim='-', from_dict=None):
5335 if from_dict is not None:
7b2c3f47 5336 values = (traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 5337 return delim.join(map(str, filter(None, values)))
06e57990 5338
5339
27231526
ZM
5340def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5341 """
5342 Find the largest format dimensions in terms of video width and, for each thumbnail:
5343 * Modify the URL: Match the width with the provided regex and replace with the former width
5344 * Update dimensions
5345
5346 This function is useful with video services that scale the provided thumbnails on demand
5347 """
5348 _keys = ('width', 'height')
5349 max_dimensions = max(
86e5f3ed 5350 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
5351 default=(0, 0))
5352 if not max_dimensions[0]:
5353 return thumbnails
5354 return [
5355 merge_dicts(
5356 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5357 dict(zip(_keys, max_dimensions)), thumbnail)
5358 for thumbnail in thumbnails
5359 ]
5360
5361
93c8410d
LNO
5362def parse_http_range(range):
5363 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5364 if not range:
5365 return None, None, None
5366 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5367 if not crg:
5368 return None, None, None
5369 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5370
5371
6b9e832d 5372def read_stdin(what):
5373 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5374 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5375 return sys.stdin
5376
5377
06e57990 5378class Config:
5379 own_args = None
9e491463 5380 parsed_args = None
06e57990 5381 filename = None
5382 __initialized = False
5383
5384 def __init__(self, parser, label=None):
9e491463 5385 self.parser, self.label = parser, label
06e57990 5386 self._loaded_paths, self.configs = set(), []
5387
5388 def init(self, args=None, filename=None):
5389 assert not self.__initialized
65662dff 5390 directory = ''
06e57990 5391 if filename:
5392 location = os.path.realpath(filename)
65662dff 5393 directory = os.path.dirname(location)
06e57990 5394 if location in self._loaded_paths:
5395 return False
5396 self._loaded_paths.add(location)
5397
9e491463 5398 self.own_args, self.__initialized = args, True
5399 opts, _ = self.parser.parse_known_args(args)
5400 self.parsed_args, self.filename = args, filename
5401
5402 for location in opts.config_locations or []:
6b9e832d 5403 if location == '-':
5404 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5405 continue
65662dff 5406 location = os.path.join(directory, expand_path(location))
06e57990 5407 if os.path.isdir(location):
5408 location = os.path.join(location, 'yt-dlp.conf')
5409 if not os.path.exists(location):
9e491463 5410 self.parser.error(f'config location {location} does not exist')
06e57990 5411 self.append_config(self.read_file(location), location)
5412 return True
5413
5414 def __str__(self):
5415 label = join_nonempty(
5416 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5417 delim=' ')
5418 return join_nonempty(
5419 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5420 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5421 delim='\n')
5422
5423 @staticmethod
5424 def read_file(filename, default=[]):
5425 try:
5426 optionf = open(filename)
86e5f3ed 5427 except OSError:
06e57990 5428 return default # silently skip if file is not present
5429 try:
5430 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5431 contents = optionf.read()
f9934b96 5432 res = shlex.split(contents, comments=True)
44a6fcff 5433 except Exception as err:
5434 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 5435 finally:
5436 optionf.close()
5437 return res
5438
5439 @staticmethod
5440 def hide_login_info(opts):
86e5f3ed 5441 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5442 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5443
5444 def _scrub_eq(o):
5445 m = eqre.match(o)
5446 if m:
5447 return m.group('key') + '=PRIVATE'
5448 else:
5449 return o
5450
5451 opts = list(map(_scrub_eq, opts))
5452 for idx, opt in enumerate(opts):
5453 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5454 opts[idx + 1] = 'PRIVATE'
5455 return opts
5456
5457 def append_config(self, *args, label=None):
9e491463 5458 config = type(self)(self.parser, label)
06e57990 5459 config._loaded_paths = self._loaded_paths
5460 if config.init(*args):
5461 self.configs.append(config)
5462
5463 @property
5464 def all_args(self):
5465 for config in reversed(self.configs):
5466 yield from config.all_args
9e491463 5467 yield from self.parsed_args or []
5468
5469 def parse_known_args(self, **kwargs):
5470 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 5471
5472 def parse_args(self):
9e491463 5473 return self.parser.parse_args(self.all_args)
da42679b
LNO
5474
5475
5476class WebSocketsWrapper():
5477 """Wraps websockets module to use in non-async scopes"""
abfecb7b 5478 pool = None
da42679b 5479
3cea3edd 5480 def __init__(self, url, headers=None, connect=True):
059bc4db 5481 self.loop = asyncio.new_event_loop()
9cd08050 5482 # XXX: "loop" is deprecated
5483 self.conn = websockets.connect(
5484 url, extra_headers=headers, ping_interval=None,
5485 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd
LNO
5486 if connect:
5487 self.__enter__()
15dfb392 5488 atexit.register(self.__exit__, None, None, None)
da42679b
LNO
5489
5490 def __enter__(self):
3cea3edd 5491 if not self.pool:
9cd08050 5492 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b
LNO
5493 return self
5494
5495 def send(self, *args):
5496 self.run_with_loop(self.pool.send(*args), self.loop)
5497
5498 def recv(self, *args):
5499 return self.run_with_loop(self.pool.recv(*args), self.loop)
5500
5501 def __exit__(self, type, value, traceback):
5502 try:
5503 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5504 finally:
5505 self.loop.close()
15dfb392 5506 self._cancel_all_tasks(self.loop)
da42679b
LNO
5507
5508 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5509 # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
5510 @staticmethod
5511 def run_with_loop(main, loop):
059bc4db 5512 if not asyncio.iscoroutine(main):
da42679b
LNO
5513 raise ValueError(f'a coroutine was expected, got {main!r}')
5514
5515 try:
5516 return loop.run_until_complete(main)
5517 finally:
5518 loop.run_until_complete(loop.shutdown_asyncgens())
5519 if hasattr(loop, 'shutdown_default_executor'):
5520 loop.run_until_complete(loop.shutdown_default_executor())
5521
5522 @staticmethod
5523 def _cancel_all_tasks(loop):
059bc4db 5524 to_cancel = asyncio.all_tasks(loop)
da42679b
LNO
5525
5526 if not to_cancel:
5527 return
5528
5529 for task in to_cancel:
5530 task.cancel()
5531
9cd08050 5532 # XXX: "loop" is removed in python 3.10+
da42679b 5533 loop.run_until_complete(
059bc4db 5534 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b
LNO
5535
5536 for task in to_cancel:
5537 if task.cancelled():
5538 continue
5539 if task.exception() is not None:
5540 loop.call_exception_handler({
5541 'message': 'unhandled exception during asyncio.run() shutdown',
5542 'exception': task.exception(),
5543 'task': task,
5544 })
5545
5546
8b7539d2 5547def merge_headers(*dicts):
08d30158 5548 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
76aa9913 5549 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
28787f16 5550
5551
5552class classproperty:
c487cf00 5553 """classmethod(property(func)) that works in py < 3.9"""
5554
5555 def __init__(self, func):
5556 functools.update_wrapper(self, func)
5557 self.func = func
28787f16 5558
5559 def __get__(self, _, cls):
c487cf00 5560 return self.func(cls)
19a03940 5561
5562
64fa820c 5563class Namespace(types.SimpleNamespace):
591bb9d3 5564 """Immutable namespace"""
591bb9d3 5565
7896214c 5566 def __iter__(self):
64fa820c 5567 return iter(self.__dict__.values())
7896214c 5568
64fa820c 5569 @property
5570 def items_(self):
5571 return self.__dict__.items()
9b8ee23b 5572
5573
5574# Deprecated
5575has_certifi = bool(certifi)
5576has_websockets = bool(websockets)